load libraries
INDEX Fig1. a. (left) 394 (right) 227
Figure 1. a. (left) 394, stats 337 (right) 227 b. 459, stats 337 c. 2591, stats 2647 d. 2730 e. 2430
Figure 2. a. 679 stats, 585 b. 1212, stats 1119 c. 2591, stats 2647 d. 2430 e. 2972, stats 2972 f. right 2998, stats 2998; left 3021, stats 2998
Figure S2. a. 896 b. 1372 c. 974
Figure S3. a. 1172, stats 1172 b. 1267 c. 863
Figure S4. a. 1412 b. 1474 c. 1571 d. NA e. 2050
Figure 3. a. 1912, stats 1847 b. left 2756, right 2851, stats 2851
Figure 4. a. 2709, stats 2647 b. 2730 c. 2430, stats 2549
Figure 5. a. 2647, stats 2690 b. 2506, 3183 c. 2765, 3154 d. left 3135, stats 3135; right 3106, stats 3106
Figure 6. a. left 3396, right 3396 b. 3396, stats 3396 c. 3396, stats 3396 d. 3531, 3591 e. 3481, 3522
PRS**************************************
Input the PRS data
# Read the PRS data
prs_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/PRS_array_for_manuscript.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)
Set color palette
group_colors <- c("#3C8C78","grey")
Define your list of traits to analyze
# List of traits to analyze
traits <- c("Heart_rate", "LQTS", "PR_interval", "QTc", "Afib", "HRV", "BP", "QRS", "HCM", "LVEDV","LVEDVI","LVEF","LVESV","LVESVI","SV","SVI","GGE","Focal_Epilepsy")
traits
[1] "Heart_rate" "LQTS" "PR_interval" "QTc"
[5] "Afib" "HRV" "BP" "QRS"
[9] "HCM" "LVEDV" "LVEDVI" "LVEF"
[13] "LVESV" "LVESVI" "SV" "SVI"
[17] "GGE" "Focal_Epilepsy"
prs_data <- prs_data %>%
mutate(across(all_of(traits), ~ scale(.x) %>% as.numeric, .names = "Normalized_{.col}"))
group_definitions <- list(
"Control" = c(1),
"Case"= c(2,3,4,5,6)
)
Format the data
# Function to determine the group for a given arrest ontology value
# Map an arrest_ontology code to its group label ("Control"/"Case").
#
# `groups` is a named list of code vectors (defaults to the script-level
# `group_definitions`), so the mapping is no longer tied to a global.
# Codes not found in any group are returned as their character form.
get_group <- function(arrest_ontology_value, groups = group_definitions) {
  for (group_name in names(groups)) {
    if (arrest_ontology_value %in% groups[[group_name]]) {
      return(group_name)
    }
  }
  # Fall through: keep unmatched codes visible instead of dropping them
  return(as.character(arrest_ontology_value))
}
# Categorize arrest_ontology into specified groups.
# ungroup() is needed: rowwise() otherwise leaves prs_data as a rowwise
# data frame, silently changing the semantics of later dplyr verbs.
prs_data <- prs_data %>%
  rowwise() %>%
  mutate(arrest_group = get_group(arrest_ontology)) %>%
  ungroup()
# Filter out only the relevant groups for the plot
relevant_groups <- names(group_definitions)
filtered_data <- prs_data %>%
filter(arrest_group %in% relevant_groups)
# Generate normalized column names based on the traits list
# (the original generated this vector twice; the duplicate was removed)
normalized_traits <- paste("Normalized", traits, sep = "_")
# Keep only the normalized columns that actually exist in the data
existing_normalized_traits <- normalized_traits[normalized_traits %in% names(filtered_data)]
# Reshape the filtered data to long format: one row per sample x normalized trait
melted_data_normalized <- melt(filtered_data,
                               id.vars = c('arrest_group'),
                               measure.vars = existing_normalized_traits)
Split into discovery or replication
prs_data_discovery <- prs_data %>%
filter(Set == "Discovery")
prs_data_replication <- prs_data %>%
filter(Set == "Replication")
Make the Corrplot
# Select only the relevant columns for correlation
data_for_correlation <- prs_data_discovery[, normalized_traits]
#order the traits
corr_traits <- c("Normalized_BP", "Normalized_Heart_rate", "Normalized_HRV", "Normalized_PR_interval", "Normalized_QTc", "Normalized_QRS",
"Normalized_LVEDV", "Normalized_LVEDVI", "Normalized_LVEF", "Normalized_LVESV", "Normalized_LVESVI", "Normalized_SV", "Normalized_SVI",
"Normalized_LQTS", "Normalized_Afib", "Normalized_HCM", "Normalized_GGE", "Normalized_Focal_Epilepsy")
# Ensure the DataFrame is ordered according to trait preferences
data_for_correlation_ordered <- data_for_correlation[, corr_traits]
# Calculate correlation matrix
cor_matrix_ordered <- cor(data_for_correlation_ordered, use = "complete.obs")
#plot correlation
corrplot(cor_matrix_ordered,
method = "circle",
type = "full",
tl.col = "black",
tl.srt = 45,
cl.ratio = 0.2,
col = colorRampPalette(c("#05618F", "white", "#F0BE3C"))(200),
diag = FALSE)
Test for ANOVA
# Check one-way ANOVA assumptions for a single trait.
#
# Fits `trait ~ arrest_group` and returns a one-row data frame with p-values
# from Shapiro-Wilk (residual normality), Levene's test (robust variance
# homogeneity), and Bartlett's test (normality-sensitive variance homogeneity).
check_anova_assumptions <- function(data, trait) {
  # Ensure 'arrest_group' is a factor so aov()/leveneTest() treat it as groups
  data$arrest_group <- as.factor(data$arrest_group)
  # Build the model formula once; avoid shadowing base::formula
  anova_formula <- as.formula(paste(trait, "~ arrest_group"))
  anova_model <- aov(anova_formula, data = data)
  # Residuals feed the normality check; avoid shadowing stats::residuals
  model_resids <- residuals(anova_model)
  # Shapiro-Wilk test for normality of residuals
  shapiro_p_value <- shapiro.test(model_resids)$p.value
  # NOTE(review): library(car) is deliberately kept here (rather than
  # car::leveneTest) because later script code calls leveneTest() unqualified
  # and relies on this attach side effect. Ideally move library(car) to the
  # top of the script.
  library(car)
  # Levene's Test for homogeneity of variances
  levene_p_value <- leveneTest(anova_formula, data = data)$`Pr(>F)`[1]
  # Bartlett's Test for homogeneity of variances
  bartlett_p_value <- bartlett.test(anova_formula, data = data)$p.value
  # One-row summary of all assumption checks for this trait
  data.frame(
    Trait = trait,
    Shapiro_Wilk_p_value = shapiro_p_value,
    Levene_p_value = levene_p_value,
    Bartlett_p_value = bartlett_p_value
  )
}
anova_assumptions_results <- lapply(traits, function(trait) check_anova_assumptions(filtered_data, trait))
Loading required package: carData
Attaching package: 'car'
The following object is masked from 'package:dplyr':
recode
# Combine the results into a single data frame
anova_assumptions_df <- do.call(rbind, anova_assumptions_results)
# Print the results
print(anova_assumptions_df)
Trait Shapiro_Wilk_p_value Levene_p_value Bartlett_p_value
1 Heart_rate 5.271165e-05 5.681662e-01 1.285170e-01
2 LQTS 3.162300e-01 2.617585e-01 2.361534e-01
3 PR_interval 6.446775e-02 8.649432e-01 9.542401e-01
4 QTc 2.262405e-08 6.712154e-01 6.782251e-01
5 Afib 8.268008e-01 3.494958e-01 4.902742e-01
6 HRV 5.551286e-14 2.877576e-01 1.825408e-01
7 BP 1.241454e-01 3.471054e-01 2.514376e-01
8 QRS 9.731434e-43 8.101722e-06 6.553831e-21
9 HCM 1.785614e-03 7.176570e-01 8.548699e-01
10 LVEDV 4.580060e-02 1.187708e-03 5.521181e-03
11 LVEDVI 3.143288e-02 1.322513e-02 4.080442e-02
12 LVEF 4.456790e-03 1.406317e-01 2.420456e-01
13 LVESV 7.247199e-03 7.239969e-01 9.165433e-01
14 LVESVI 7.166318e-06 4.876455e-01 1.697590e-01
15 SV 6.753341e-07 6.827648e-01 8.366959e-01
16 SVI 3.751355e-04 9.185243e-01 7.460529e-01
17 GGE 2.658597e-02 7.995791e-01 6.491467e-01
18 Focal_Epilepsy 3.652661e-03 5.061986e-01 3.745069e-01
Since some do not meet ANOVA criteria, we go with nonparametric
# Run a Kruskal-Wallis test of `trait` across arrest_group levels and
# return a one-row data frame with the trait name and the p-value.
perform_kruskal_wallis <- function(data, trait) {
  # Build `trait ~ arrest_group` without string pasting
  kw_formula <- reformulate("arrest_group", response = trait)
  kw_result <- kruskal.test(kw_formula, data = data)
  # One-row summary: trait name plus its Kruskal-Wallis p-value
  data.frame(
    Trait = trait,
    Kruskal_Wallis_p_value = kw_result$p.value
  )
}
# Run Dunn's post-hoc test (Bonferroni-adjusted) for `trait` across the
# arrest groups and return a data frame of pairwise comparisons with
# Z statistics, adjusted p-values, and a 0.05 significance flag.
perform_dunn_test <- function(filtered_data, trait) {
  # Perform Dunn's test on the raw trait values, grouped by arrest_group
  dunn_result <- dunn.test(
    x = filtered_data[[trait]],
    g = filtered_data$arrest_group,
    method = "bonferroni"
  )
  adjusted_p <- dunn_result$P.adjusted
  # Assemble one row per pairwise comparison, tagged with the trait name
  data.frame(
    Comparison = dunn_result$comparisons,
    Z_value = dunn_result$Z,
    P_value = adjusted_p,
    Significant = adjusted_p < 0.05,
    Trait = trait
  )
}
# Perform Kruskal-Wallis test for each trait and store results
kruskal_results <- lapply(traits, function(trait) perform_kruskal_wallis(filtered_data, trait))
# Combine Kruskal-Wallis results into a single data frame
combined_kruskal_results <- do.call(rbind, kruskal_results)
# Print Kruskal-Wallis results
print("Kruskal-Wallis Test Results:")
[1] "Kruskal-Wallis Test Results:"
print(combined_kruskal_results)
Trait Kruskal_Wallis_p_value
1 Heart_rate 4.382702e-03
2 LQTS 6.797534e-01
3 PR_interval 1.580135e-01
4 QTc 4.795650e-03
5 Afib 2.026287e-01
6 HRV 6.138060e-01
7 BP 1.279428e-05
8 QRS 7.181042e-08
9 HCM 6.125892e-01
10 LVEDV 3.302545e-01
11 LVEDVI 9.615777e-02
12 LVEF 5.040997e-04
13 LVESV 1.741015e-03
14 LVESVI 4.664870e-01
15 SV 5.332224e-02
16 SVI 5.284722e-04
17 GGE 3.575482e-01
18 Focal_Epilepsy 5.808578e-01
# Perform Dunn's test for each trait and store results
dunn_results <- lapply(traits, function(trait) perform_dunn_test(filtered_data, trait))
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 8.118, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -2.849216
| 0.0022*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.1704, df = 1, p-value = 0.68
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -0.412799
| 0.3399
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 1.9931, df = 1, p-value = 0.16
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 1.411784
| 0.0790
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 7.9549, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -2.820448
| 0.0024*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 1.6233, df = 1, p-value = 0.2
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -1.274097
| 0.1013
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.2547, df = 1, p-value = 0.61
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 0.504648
| 0.3069
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 19.041, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 4.363594
| 0.0000*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 29.0153, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 5.386581
| 0.0000*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.2564, df = 1, p-value = 0.61
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -0.506381
| 0.3063
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.9479, df = 1, p-value = 0.33
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 0.973601
| 0.1651
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 2.7681, df = 1, p-value = 0.1
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -1.663773
| 0.0481
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 12.1004, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -3.478568
| 0.0003*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 9.8043, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 3.131186
| 0.0009*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.5303, df = 1, p-value = 0.47
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 0.728206
| 0.2332
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 3.7338, df = 1, p-value = 0.05
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -1.932301
| 0.0267
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 12.0124, df = 1, p-value = 0
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 3.465893
| 0.0003*
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.8465, df = 1, p-value = 0.36
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | 0.920046
| 0.1788
alpha = 0.05
Reject Ho if p <= alpha/2
Kruskal-Wallis rank sum test
data: trait_values and groups
Kruskal-Wallis chi-squared = 0.3048, df = 1, p-value = 0.58
Comparison of trait_values by groups
(Bonferroni)
Col Mean-|
Row Mean | Case
---------+-----------
Control | -0.552132
| 0.2904
alpha = 0.05
Reject Ho if p <= alpha/2
# Combine Dunn test results into a single data frame
combined_dunn_results <- do.call(rbind, dunn_results)
# Print Dunn Test results
print("Dunn's Test Post-Hoc Results:")
[1] "Dunn's Test Post-Hoc Results:"
print(combined_dunn_results)
Comparison Z_value P_value Significant Trait
1 Case - Control -2.8492167 2.191351e-03 TRUE Heart_rate
2 Case - Control -0.4127996 3.398767e-01 FALSE LQTS
3 Case - Control 1.4117842 7.900676e-02 FALSE PR_interval
4 Case - Control -2.8204490 2.397825e-03 TRUE QTc
5 Case - Control -1.2740978 1.013144e-01 FALSE Afib
6 Case - Control 0.5046481 3.069030e-01 FALSE HRV
7 Case - Control 4.3635942 6.397140e-06 TRUE BP
8 Case - Control 5.3865814 3.590521e-08 TRUE QRS
9 Case - Control -0.5063811 3.062946e-01 FALSE HCM
10 Case - Control 0.9736014 1.651272e-01 FALSE LVEDV
11 Case - Control -1.6637732 4.807888e-02 TRUE LVEDVI
12 Case - Control -3.4785685 2.520498e-04 TRUE LVEF
13 Case - Control 3.1311866 8.705075e-04 TRUE LVESV
14 Case - Control 0.7282069 2.332435e-01 FALSE LVESVI
15 Case - Control -1.9323020 2.666112e-02 TRUE SV
16 Case - Control 3.4658937 2.642361e-04 TRUE SVI
17 Case - Control 0.9200469 1.787741e-01 FALSE GGE
18 Case - Control -0.5521322 2.904289e-01 FALSE Focal_Epilepsy
# Define the groups: quantitative metrics vs. disease syndromes
metrics <- c("Heart_rate", "PR_interval", "QTc", "HRV", "BP", "QRS","LVEDV", "LVEDVI","LVEF","LVESV","LVESVI","SV","SVI")
syndromes <- c("LQTS", "Afib", "HCM","GGE","Focal_Epilepsy")
# Categorize each trait
combined_dunn_results$Category <- ifelse(combined_dunn_results$Trait %in% metrics, "Metrics",
ifelse(combined_dunn_results$Trait %in% syndromes, "Syndromes", "Combined"))
# Reorder the levels of Trait based on the Category
combined_dunn_results <- combined_dunn_results %>%
mutate(Trait = factor(Trait, levels = unique(Trait[order(Category)])))
# Transform P-values to -log10 scale
combined_dunn_results$NegLogP <- -log10(combined_dunn_results$P_value)
# Calculate the -log10(0.05) for the reference line
threshold <- -log10(0.05)
# Create a new variable for coloring based on P-value significance
combined_dunn_results$Significant <- ifelse(combined_dunn_results$P_value < 0.05, "Significant", "Not Significant")
# Number of unique comparisons
num_comparisons <- length(unique(combined_dunn_results$Comparison))
# Combine metrics and syndromes into a single ordered list
# NOTE(review): "WT_global" has no rows in combined_dunn_results here, so it
# appears as an unused factor level — confirm whether it is intentional.
ordered_traits <- c("BP", "Heart_rate", "HRV", "PR_interval", "QTc","QRS",
"LVEDV", "LVEDVI", "LVEF", "LVESV", "LVESVI", "SV", "SVI",
"LQTS", "Afib", "HCM", "GGE", "Focal_Epilepsy","WT_global")
# Update the 'Trait' column to have this specific order
combined_dunn_results$Trait <- factor(combined_dunn_results$Trait,
levels = ordered_traits)
# Lollipop plot of -log10(p) per trait, colored by (negated) Dunn Z statistic.
# `linewidth` replaces the line `size` aesthetic deprecated in ggplot2 3.4.0
# (this removes the deprecation warning seen when the plot was first built).
p3 <- ggplot(combined_dunn_results, aes(x = Trait, y = NegLogP, color = -Z_value)) +
geom_segment(aes(xend = Trait, yend = 0), linewidth = 1) +
geom_point(size = 3) +
geom_hline(yintercept = threshold, linetype = "dotted", color = "Black") +
scale_color_gradient2(low = "black", mid = "white", high = "#3C8C78", midpoint = 0) +
facet_wrap(~Comparison, scales = "free_x") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
strip.text.x = element_text(size = 10)) +
labs(y = "-log10(P-value)", x = "Trait", color = "Z-value", title = "Lollipop Plot of -log10(P-values) by Trait and Group Comparison")
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
# Display the plots
print(p3)
# Function to generate plots for a given trait
# Build a two-panel figure (rank density + rank CDF) for one trait,
# colored by arrest group.
#
# data          data frame containing `trait` and an `arrest_group` column
# trait         column name (string) of the raw trait scores to rank
# group_colors  color vector passed to the fill/color scales
#
# Uses after_stat(density) and linewidth, replacing the `..density..` and
# line `size` aesthetics deprecated in ggplot2 3.4.0 (both deprecation
# warnings appeared when this script was last run).
generate_plots_for_trait <- function(data, trait, group_colors) {
  # Rank the scores for the specified trait; NAs keep their positions
  data$rank <- rank(data[[trait]], na.last = "keep")
  # Density plot of the trait's ranks by arrest group
  density_plot <- ggplot(data, aes(x = rank, fill = arrest_group, y = after_stat(density))) +
    geom_density(alpha = 0.6) +
    scale_fill_manual(values = group_colors) +
    labs(title = paste("Relative Density of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Relative Density") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Empirical CDF of the trait's ranks by arrest group
  cdf_plot <- ggplot(data, aes(x = rank, color = arrest_group)) +
    stat_ecdf(geom = "step", linewidth = 1) +
    scale_color_manual(values = group_colors) +
    labs(title = paste("CDF of Overall", trait, "Trait Ranks Across Arrest Groups"),
         x = paste("Overall Rank of", trait, "Scores"),
         y = "Cumulative Proportion") +
    theme_cowplot(12) +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  # Stack the two panels vertically, labelled A and B
  combined_plot <- plot_grid(density_plot, cdf_plot, ncol = 1, align = 'v', labels = c("A", "B"))
  return(combined_plot)
}
# List of traits to plot
traits <- c("BP", "Afib", "QRS","LVEF","Heart_rate")
# Initialize an empty list to store plots
plots_list <- list()
# Loop through each trait and generate plots, storing them in the list
for(trait in traits) {
plots_list[[trait]] <- generate_plots_for_trait(prs_data, trait, group_colors)
}
Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(density)` instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
######## Access each plot by its trait name from the plots_list ##########
# For example, to print the plot for QRS, use:
print(plots_list[["QRS"]])
print(plots_list[["Heart_rate"]])
Coding******************************************************************************************************************
coding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/statistics_by_individual_replication_for_manuscript.txt", header = TRUE, sep = "\t")
coding_stats_discovery <- coding_stats[coding_stats$Replication == 0, ]
Categorize the data
categorized_coding_stats_discovery <- coding_stats_discovery %>%
mutate(Category_Group = case_when(
Category == 1 ~ "Control",
Category %in% 2:6 ~ "Case"
))
Merge some of the column data to get the desired AF intervals
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
mutate(Epilepsy_01_0001 = (Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001),
Epilepsy_00001 = (Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton),
Epilepsy_total_missense = (Epilepsy_missense_common +Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001 + Epilepsy_missense_variant_0.00001 +
Epilepsy_missense_singleton))
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
mutate(Cardiomyopathy_01_00001 = (Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001),
Cardiomyopathy_00001 = (Cardiomyopathy_missense_variant_0.00001 +Cardiomyopathy_missense_singleton),
Cardiomyopathy_total_missense = (Cardiomyopathy_missense_common + Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001 +
Cardiomyopathy_missense_variant_0.00001 +Cardiomyopathy_missense_singleton))
#clean it up to remove inadvertent NAs or infs
categorized_coding_stats_discovery <- categorized_coding_stats_discovery %>%
dplyr::select(-ID, -Category) %>%
na.omit() %>%
dplyr::filter_all(all_vars(!is.infinite(.)))
# Aggregate the Data to compute mean and standard deviation for each numeric column
collapsed_coding_stats <- categorized_coding_stats_discovery %>%
group_by(Category_Group) %>%
summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE),
std = ~sd(., na.rm = TRUE))))
# clean it up to remove inadvertent NAs
collapsed_coding_stats <- na.omit(collapsed_coding_stats)
Perform T-test for unequal variance
# Pull variables to analyze
coding_variables_to_analyze <- c(
  "Cardiomyopathy_HIGH", "Cardiomyopathy_start_lost", "Cardiomyopathy_stop_gained",
  "Cardiomyopathy_total_missense", "Cardiomyopathy_01_00001", "Cardiomyopathy_00001",
  "PLP_Cardiomyopathy", "Epilepsy_HIGH", "Epilepsy_start_lost", "Epilepsy_stop_gained",
  "Epilepsy_total_missense", "Epilepsy_01_0001", "Epilepsy_00001",
  "PLP_Epilepsy", "variant_count"
)
ttest_results <- list()
# Factor conversion only needs to happen once, not inside the loop
categorized_coding_stats_discovery$Category_Group <- as.factor(categorized_coding_stats_discovery$Category_Group)
# For each numeric variable: check variance homogeneity with Levene's test,
# then run a pooled t-test when variances look equal (Levene p > 0.05) or
# Welch's t-test otherwise. var.equal carries that decision directly,
# collapsing the three near-identical t.test() calls of the original.
for (var in coding_variables_to_analyze) {
  if (is.numeric(categorized_coding_stats_discovery[[var]])) {
    # Build `var ~ Category_Group` once per variable
    ttest_formula <- reformulate("Category_Group", response = var)
    # Fit the ANOVA model to get residuals for the constant-value check
    model <- aov(ttest_formula, data = categorized_coding_stats_discovery)
    model_resids <- residuals(model)
    if (length(unique(model_resids)) > 1) {
      # Levene's Test for homogeneity of variances (car must be attached)
      levene_p_value <- leveneTest(ttest_formula, data = categorized_coding_stats_discovery)$`Pr(>F)`[1]
      ttest <- t.test(ttest_formula, data = categorized_coding_stats_discovery,
                      var.equal = levene_p_value > 0.05)
    } else {
      # Constant residuals: Levene's test is undefined, fall back to Welch's
      ttest <- t.test(ttest_formula, data = categorized_coding_stats_discovery, var.equal = FALSE)
    }
    ttest_results[[var]] <- broom::tidy(ttest)
  } else {
    # Non-numeric column: record NA so the result list keeps the input order
    ttest_results[[var]] <- NA
  }
}
# Print t-test results
print("t-test Results:")
[1] "t-test Results:"
print(bind_rows(ttest_results, .id = "Variable"))
# A tibble: 15 × 11
Variable estimate estimate1 estimate2 statistic p.value parameter conf.low
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Cardiomyo… 1.41e+0 2.77e+0 1.36e+0 5.64 2.74e- 8 575. 9.17e-1
2 Cardiomyo… 3.39e-2 3.95e-2 5.59e-3 3.25 1.22e- 3 610. 1.34e-2
3 Cardiomyo… 3.25e-1 3.72e-1 4.66e-2 5.01 7.46e- 7 525. 1.98e-1
4 Cardiomyo… -5.48e+0 1.06e+2 1.11e+2 -4.95 8.74e- 7 1041 -7.66e+0
5 Cardiomyo… -1.23e+0 1.32e+1 1.44e+1 -3.16 1.65e- 3 1041 -2.00e+0
6 Cardiomyo… 4.83e-1 1.50e+0 1.02e+0 2.52 1.19e- 2 576. 1.07e-1
7 PLP_Cardi… 1.70e-1 2.41e-1 7.08e-2 7.23 1.11e-12 791. 1.24e-1
8 Epilepsy_… 8.57e-1 6.26e+0 5.41e+0 2.70 7.13e- 3 633. 2.34e-1
9 Epilepsy_… 1.00e-2 1.19e-2 1.86e-3 1.94 5.34e- 2 653. -1.45e-4
10 Epilepsy_… 3.13e-1 3.95e-1 8.19e-2 3.96 8.63e- 5 530. 1.58e-1
11 Epilepsy_… -1.42e+0 6.73e+1 6.87e+1 -2.45 1.45e- 2 962. -2.56e+0
12 Epilepsy_… -1.09e+0 8.71e+0 9.80e+0 -4.59 4.95e- 6 1041 -1.56e+0
13 Epilepsy_… 5.16e-1 1.55e+0 1.03e+0 2.12 3.43e- 2 543. 3.82e-2
14 PLP_Epile… 1.33e-2 6.92e-2 5.59e-2 0.874 3.83e- 1 1041 -1.66e-2
15 variant_c… -2.82e+3 1.14e+5 1.17e+5 -4.74 2.41e- 6 1039. -3.99e+3
# ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
Create the Function to plot each column as a ratio to the means from Group 1
# Summarise each Category_Group's mean of `col_name` as a ratio to the
# Control group's mean, with SEM scaled the same way and ~95% CI bounds.
#
# Returns a list with one element, `summary_data`, preserving the original
# interface (callers access result$summary_data).
plot_column_ratio <- function(data, col_name) {
  # Reference value: mean of the column within the Control group.
  # (The unused Control-group SD computed by the original was removed.)
  group1_mean <- mean(data[[col_name]][data$Category_Group == "Control"], na.rm = TRUE)
  # Per-group mean/SD/N, then everything expressed relative to Control
  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n),
           sem_ratio = sem_value / group1_mean, # SEM scaled to Control, like the mean
           ci_lower = ratio - (1.96 * sem_ratio), # lower bound of ~95% CI
           ci_upper = ratio + (1.96 * sem_ratio)) # upper bound of ~95% CI
  return(list(summary_data = data_summary))
}
Plot the data relative to group 1
# List of all columns to summarise (everything except identifier columns)
columns_to_plot <- setdiff(names(categorized_coding_stats_discovery), c("ID", "Category", "Category_Group"))
# Compute each column's ratios relative to the Control group, tag each
# summary with its column name, and row-bind everything in one pass.
# This replaces growing the data frame inside a loop, and drops the
# original's empty seed data frame whose `mean`/`std` columns did not match
# the summaries (bind_rows would have carried them along as all-NA columns).
combined_coding_stats_summary_df <- bind_rows(lapply(columns_to_plot, function(col) {
  mutate(plot_column_ratio(categorized_coding_stats_discovery, col)$summary_data, col_name = col)
}))
Plot the data relative to group in a better way
# Subset the data for the specified variables
selected_coding_data <- combined_coding_stats_summary_df %>%
filter(col_name %in% c("variant_count",
"Cardiomyopathy_total_missense",
"Cardiomyopathy_01_00001",
"Cardiomyopathy_00001",
"Epilepsy_total_missense",
"Epilepsy_01_0001",
"Epilepsy_00001",
"PLP_Cardiomyopathy",
"PLP_Epilepsy",
"Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained",
"Cardiomyopathy_HIGH",
"Epilepsy_start_lost",
"Epilepsy_HIGH",
"Epilepsy_stop_gained"
),
Category_Group != "Control")
# Define the specific order for "Epilepsy" and CMAR variants
levels_order <- c("variant_count",
"PLP_Epilepsy",
"Epilepsy_00001",
"Epilepsy_01_0001",
"Epilepsy_total_missense",
"Epilepsy_start_lost",
"Epilepsy_stop_gained",
"Epilepsy_HIGH",
"PLP_Cardiomyopathy",
"Cardiomyopathy_00001",
"Cardiomyopathy_01_00001",
"Cardiomyopathy_total_missense",
"Cardiomyopathy_start_lost",
"Cardiomyopathy_stop_gained",
"Cardiomyopathy_HIGH"
)
selected_coding_data$col_name <- factor(selected_coding_data$col_name, levels = levels_order)
# Plot
coding_stats_plot <- ggplot(selected_coding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
geom_point(position = position_dodge(width = 1), size = 3) +
geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 1) +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_color_manual(values = "#3C8C78") +
labs(title = "Ratio Compared to Control +/-SEM",
y = "Variant",
x = "Ratio to Control Mean",
color = "Category Group") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8, hjust = 1),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8),
plot.title = element_text(size = 8, hjust = 0.5),
plot.subtitle = element_text(size = 8),
plot.caption = element_text(size = 8),
plot.margin = margin(15, 15, 15, 15)) +
scale_x_continuous(limits = c(-2, 10))
print(coding_stats_plot)
Input the data
# Pull in the gene data
gene_data <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_variants_by_gene_for_manuscript.csv")
# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript.txt", sep = "\t", header = TRUE)
# add the arrest category to each
gene_data$Category <- cohort$arrest_ontology[match(gene_data$ID, cohort$CGM_id)]
gene_data_discovery <- gene_data %>%
filter(Set == "Discovery")
Summarize the data by GENE, counting the number of variants for each gene Filter for Category > 1 and then group and summarize
variants_per_gene_Cat_2_6 <- gene_data_discovery %>%
filter(Category > 1) %>%
group_by(GENE) %>%
summarise(
HIGH = sum(HIGH, na.rm = TRUE),
PLP = sum(PLP, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_2_6)
# A tibble: 361 × 3
GENE HIGH PLP
<chr> <int> <int>
1 A2ML1 57 0
2 AARS 12 0
3 ABAT 0 0
4 ABCC9 4 0
5 ACADVL 5 2
6 ACTC1 1 0
7 ACTL6B 0 0
8 ACTN2 15 0
9 ADAM22 3 0
10 ADSL 1 1
# ℹ 351 more rows
Filter for Category == 1 and then group and summarize
variants_per_gene_Cat_1 <- gene_data_discovery %>%
filter(Category == 1) %>%
group_by(GENE) %>%
summarise(
HIGH = sum(HIGH, na.rm = TRUE),
PLP = sum(PLP, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_1)
# A tibble: 361 × 3
GENE HIGH PLP
<chr> <int> <int>
1 A2ML1 69 0
2 AARS 0 0
3 ABAT 0 0
4 ABCC9 3 0
5 ACADVL 8 3
6 ACTC1 0 0
7 ACTL6B 0 2
8 ACTN2 1 0
9 ADAM22 0 0
10 ADSL 0 0
# ℹ 351 more rows
Load gene lists from text files
genes_CMAR1 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR1_list.txt")
genes_CMAR2 <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_CMAR2_list.txt")
genes_EIEE_OMIM <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_EIEE_OMIM_list.txt")
genes_Epilepsy <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SDY_Epilepsy_list.txt")
Annotate gene with source panel
genes_CMAR1 <- data.frame(Gene = genes_CMAR1, Source = "Cardiomyopathy")
genes_CMAR2 <- data.frame(Gene = genes_CMAR2, Source = "Cardiomyopathy")
genes_EIEE_OMIM <- data.frame(Gene = genes_EIEE_OMIM, Source = "Epilepsy")
genes_Epilepsy <- data.frame(Gene = genes_Epilepsy, Source = "Epilepsy")
# Harmonize gene symbols with those used in the variant tables.
# Replace the quoted "AARS1" entry with "AARS"
genes_EIEE_OMIM$Gene <- ifelse(genes_EIEE_OMIM$Gene == "\"AARS1\"", "AARS", genes_EIEE_OMIM$Gene)
# Replace the quoted "NEXMIF" entry with "KIAA2022".
# NOTE(review): the original comment claimed the opposite direction
# (KIAA2022 -> NEXMIF); the code maps the current symbol NEXMIF back to the
# older KIAA2022, presumably to match the symbol in gene_data — confirm.
genes_Epilepsy$Gene <- ifelse(genes_Epilepsy$Gene == "\"NEXMIF\"", "KIAA2022", genes_Epilepsy$Gene)
Append panel source to the gene_data
# Combine all lists into one dataframe
all_genes <- rbind(genes_CMAR1, genes_CMAR2, genes_EIEE_OMIM, genes_Epilepsy)
# Sort by Gene and Source to ensure "Cardiomyopathy" comes first
all_genes <- all_genes[order(all_genes$Gene, all_genes$Source), ]
# Remove duplicates, keeping the first occurrence
all_genes <- all_genes[!duplicated(all_genes$Gene), ]
# Clean the 'Gene' column in 'all_genes' by removing quotes and backslashes
all_genes$Gene <- gsub('\"', '', all_genes$Gene)
# Ensure that the 'GENE' column in 'gene_data_discovery' and 'Gene' in 'all_genes' are character type
gene_data_discovery$GENE <- as.character(gene_data_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)
# find the index of each gene in 'all_genes' and use this index to assign the corresponding 'Source' to a new column in 'gene_data'
gene_data_discovery$Source <- all_genes$Source[match(gene_data_discovery$GENE, all_genes$Gene)]
# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'
# View the first few rows to verify the new 'Source' has been added
head(gene_data_discovery)
ID GENE HIGH PLP Set Category Source
1 CGM0000001 A2ML1 0 0 Discovery 1 Cardiomyopathy
2 CGM0000001 AARS 0 0 Discovery 1 Epilepsy
3 CGM0000001 ABAT 0 0 Discovery 1 Epilepsy
4 CGM0000001 ABCC9 0 0 Discovery 1 Cardiomyopathy
5 CGM0000001 ACADVL 0 0 Discovery 1 Cardiomyopathy
6 CGM0000001 ACTC1 0 0 Discovery 1 Cardiomyopathy
Add the source to the category 1 variants list
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_1)
# A tibble: 6 × 4
GENE HIGH PLP Source
<chr> <int> <int> <chr>
1 A2ML1 69 0 Cardiomyopathy
2 AARS 0 0 Epilepsy
3 ABAT 0 0 Epilepsy
4 ABCC9 3 0 Cardiomyopathy
5 ACADVL 8 3 Cardiomyopathy
6 ACTC1 0 0 Cardiomyopathy
Add the source to the category 2-6 variants list
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_2_6)
# A tibble: 6 × 4
GENE HIGH PLP Source
<chr> <int> <int> <chr>
1 A2ML1 57 0 Cardiomyopathy
2 AARS 12 0 Epilepsy
3 ABAT 0 0 Epilepsy
4 ABCC9 4 0 Cardiomyopathy
5 ACADVL 5 2 Cardiomyopathy
6 ACTC1 1 0 Cardiomyopathy
combine the variants together now
# Add a new column to indicate the category
variants_per_gene_Cat_1 <- variants_per_gene_Cat_1 %>%
mutate(Category = "1")
# Add a new column to indicate the category range
variants_per_gene_Cat_2_6 <- variants_per_gene_Cat_2_6 %>%
mutate(Category = "2-6")
# Combine the two datasets
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)
# Print the combined dataset
print(combined_variants)
# A tibble: 722 × 5
GENE HIGH PLP Source Category
<chr> <int> <int> <chr> <chr>
1 A2ML1 69 0 Cardiomyopathy 1
2 AARS 0 0 Epilepsy 1
3 ABAT 0 0 Epilepsy 1
4 ABCC9 3 0 Cardiomyopathy 1
5 ACADVL 8 3 Cardiomyopathy 1
6 ACTC1 0 0 Cardiomyopathy 1
7 ACTL6B 0 2 Epilepsy 1
8 ACTN2 1 0 Cardiomyopathy 1
9 ADAM22 0 0 Epilepsy 1
10 ADSL 0 0 Epilepsy 1
# ℹ 712 more rows
# Filter dataset where High > 0
combined_variants_High <- combined_variants %>% filter(HIGH > 0)
# Create a label function
# Map axis values to display labels for plotting: when labelling the
# "Category" variable, "1" becomes "Control" and every other value becomes
# "Case"; values for any other variable pass through unchanged.
custom_labeller <- function(variable, value) {
  if (variable != "Category") {
    return(value)
  }
  ifelse(value == "1", "Control", "Case")
}
# Create the plot with custom labeller
High_variant_plot <- ggplot(combined_variants_High, aes(x = GENE, y = Category, fill = HIGH)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "High Variants Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
scale_y_discrete(labels = function(x) custom_labeller("Category", x))
# Print the plot
print(High_variant_plot)
# Create a custom labeller function for PLP_plot
# Label helper for the PLP heatmap: translates "Category" values
# ("1" -> "Control", anything else -> "Case") and leaves values of any
# other variable untouched.
custom_labeller_plp <- function(variable, value) {
  labels <- if (variable == "Category") {
    ifelse(value == "1", "Control", "Case")
  } else {
    value
  }
  labels
}
# Filter datasets where PLP > 0
combined_variants_PLP <- combined_variants %>% filter(PLP > 0)
# Create the PLP plot with custom labeller
PLP_plot <- ggplot(combined_variants_PLP, aes(x = GENE, y = Category, fill = PLP)) +
geom_tile() +
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "PLP Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
scale_y_discrete(labels = function(x) custom_labeller_plp("Category", x))
# Print the plot
print(PLP_plot)
Now bring in the module data
modules <- read_excel("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/cardio_gene_sets.xlsx")
annotate the gene_data with the modules
module_data <- left_join(gene_data_discovery, modules, by = c("GENE" = "Gene"))
# View the first few rows of the updated data
head(module_data)
ID GENE HIGH PLP Set Category Source
1 CGM0000001 A2ML1 0 0 Discovery 1 Cardiomyopathy
2 CGM0000001 AARS 0 0 Discovery 1 Epilepsy
3 CGM0000001 ABAT 0 0 Discovery 1 Epilepsy
4 CGM0000001 ABCC9 0 0 Discovery 1 Cardiomyopathy
5 CGM0000001 ACADVL 0 0 Discovery 1 Cardiomyopathy
6 CGM0000001 ACTC1 0 0 Discovery 1 Cardiomyopathy
Module
1 Rasopathy
2 <NA>
3 <NA>
4 Membrane_polarity
5 mitochondria
6 sarcomere
NOW PLOT THE PLPs by their expert-defined categories. The size is the relative number of variants per gene in the category. the Color is the absolute number of variants
#Filter NAs and aggregate the data
module_data <- module_data %>%
filter(!is.na(Module))
module_data <- module_data %>%
mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))
#count up the variants
module_summary <- module_data %>%
group_by(Module, Category_Group) %>%
summarise(Total_PLP_variant = sum(PLP, na.rm = TRUE)) %>%
ungroup()
`summarise()` has grouped output by 'Module'. You can override using the
`.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
group_by(Module) %>%
summarise(Num_Genes = n()) %>%
ungroup()
# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
left_join(genes_per_module, by = c("Module" = "Module"))
# Calculate the size relative to the number of genes per module
module_summary <- module_summary %>%
mutate(Relative_Size = Total_PLP_variant / Num_Genes)
# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
filter(Category_Group == "Categories 2-6")
module_order <- module_summary_filtered %>%
group_by(Module) %>%
summarise(Sort_Metric = mean(Total_PLP_variant, na.rm = TRUE)) %>%
arrange(desc(Sort_Metric)) %>%
.$Module
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)
# Now plot with the reordered Module
modules_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_PLP_variant, color = Relative_Size)) +
geom_point(shape = 15, alpha = 1) +
scale_size(range = c(3, 20), name = "Number of PLPs") +
scale_color_gradient(low = "Grey", high = "#05618F", name = "# of PLPs relative to Module size") +
theme_minimal() +
labs(title = "Total PLP by Module (Cases)",
x = "Module",
y = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(modules_plot)
#Count the High effect variants
aggregated_data <- module_data %>%
group_by(ID, Module, Category) %>%
summarise(High_count = sum(HIGH),
.groups = 'drop')
#split off controls
data_cat_1 <- aggregated_data %>%
filter(Category %in% c(1))
#identify IDs with any 'High_count' greater than 0
ids_with_both_1 <- data_cat_1 %>%
group_by(ID) %>%
summarise(High_count_any = any(High_count > 0)) %>%
filter(High_count_any) %>%
dplyr::select(ID)
# Filter to include only rows that have IDs with High_count > 0
modules_with_both_1 <- data_cat_1 %>%
semi_join(ids_with_both_1, by = "ID")
#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
module_pairs_for_ids_1 <- modules_with_both_1 %>%
group_by(ID) %>%
do({
data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
}) %>%
dplyr::rename(Module1 = X1, Module2 = X2) %>%
left_join(modules_with_both_1, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_1, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
filter(High_count_2 > 0 | High_count_1 > 0)
# Count occurrences of module pairs across IDs
module_pair_counts_1 <- module_pairs_for_ids_1 %>%
group_by(Module1, Module2) %>%
summarise(Count = n_distinct(ID), .groups = 'drop')
#split off cases
data_cat_2_6 <- aggregated_data %>%
filter(Category %in% c(2,3,4,5,6))
#identify IDs with any 'High_count' greater than 0
ids_with_both_2_6 <- data_cat_2_6 %>%
group_by(ID) %>%
summarise(Missense_any = any(High_count > 0)) %>%
filter(Missense_any) %>%
dplyr::select(ID)
# Filter to include only rows that have IDs with High_count > 0
modules_with_both_2_6 <- data_cat_2_6 %>%
semi_join(ids_with_both_2_6, by = "ID")
#find all possible pairs of modules where 'High_count' is greater than 0
# and generate a new df with each row representing a unique pair of modules for each ID.
module_pairs_for_ids_2_6 <- modules_with_both_2_6 %>%
group_by(ID) %>%
do({
data.frame(t(combn(unique(.$Module), 2)), stringsAsFactors = FALSE)
}) %>%
dplyr::rename(Module1 = X1, Module2 = X2) %>%
left_join(modules_with_both_2_6, by = c("ID", "Module1" = "Module")) %>%
left_join(modules_with_both_2_6, by = c("ID", "Module2" = "Module"), suffix = c("_1", "_2")) %>%
filter(High_count_2 > 0 | High_count_1 > 0)
# Count occurrences of module pairs across IDs
module_pair_counts_2_6 <- module_pairs_for_ids_2_6 %>%
group_by(Module1, Module2) %>%
summarise(Count = n_distinct(ID), .groups = 'drop')
# Ensure all combinations are present with at least a zero count for each category
comparison_df <- merge(module_pair_counts_1, module_pair_counts_2_6, by = c("Module1", "Module2"), all = TRUE)
comparison_df[is.na(comparison_df)] <- 0 # Replace NA with 0
# Rename columns
names(comparison_df)[names(comparison_df) == "Count.x"] <- "Count_1"
names(comparison_df)[names(comparison_df) == "Count.y"] <- "Count_2_6"
# Add number of people per group for totals not in counts
comparison_df$Not_Count_1 <- sum(cohort$arrest_ontology == "1") - comparison_df$Count_1
comparison_df$Not_Count_2_6 <- sum(cohort$arrest_ontology %in% c(2,3,4,5,6)) - comparison_df$Count_2_6
# Function to perform Fisher's Exact Test for a row
# Fisher's exact test p-value for one 2x2 module-pair contingency table.
# Column 1 is the control group (Category 1), column 2 is the case group
# (Categories 2-6); rows are carriers ("In Count") vs non-carriers.
perform_fisher_test <- function(count_1, not_count_1, count_2_6, not_count_2_6) {
  tbl <- matrix(
    c(count_1, not_count_1, count_2_6, not_count_2_6),
    nrow = 2,
    dimnames = list(
      c("In Count", "Not in Count"),
      c("Group_1", "Group_2_6")
    )
  )
  fisher.test(tbl)$p.value
}
# Apply the Fisher function to each row in the dataframe
comparison_df$p_value <- mapply(perform_fisher_test,
comparison_df$Count_1,
comparison_df$Not_Count_1,
comparison_df$Count_2_6,
comparison_df$Not_Count_2_6)
comparison_df$adjusted_p_value <- p.adjust(comparison_df$p_value, method = "bonferroni", n = length(comparison_df$p_value))
# Filter significant results based on adjusted p-values
significant_pairs <- comparison_df %>% filter(adjusted_p_value < 0.05)
# Print significant module pairs
print(significant_pairs)
Module1 Module2 Count_1 Count_2_6 Not_Count_1
1 DGC fate_specification 5 31 591
2 DGC ICD 28 61 568
3 DGC Membrane_polarity 14 44 582
4 DGC nuclear_envelope 2 18 594
5 DGC Proteostasis 4 25 592
6 DGC sarcomere 164 216 432
7 DGC Z_disc 11 35 585
8 fate_specification lysosomal_storage 6 27 590
9 fate_specification nuclear_envelope 3 26 593
10 fate_specification sarcomere 164 226 432
11 ICD fate_specification 29 67 567
12 ICD Membrane_polarity 38 78 558
13 ICD Proteostasis 28 63 568
14 ICD sarcomere 180 239 416
15 ICD Z_disc 35 72 561
16 lysosomal_storage sarcomere 165 215 431
17 Membrane_polarity fate_specification 15 50 581
18 Membrane_polarity lysosomal_storage 15 40 581
19 Membrane_polarity nuclear_envelope 12 37 584
20 Membrane_polarity Proteostasis 14 45 582
21 Membrane_polarity sarcomere 171 229 425
22 Membrane_polarity Z_disc 21 54 575
23 mitochondria sarcomere 173 222 423
24 nuclear_envelope sarcomere 162 214 434
25 Proteostasis fate_specification 5 34 591
26 Proteostasis nuclear_envelope 2 17 594
27 Proteostasis sarcomere 162 220 434
28 Proteostasis Z_disc 11 36 585
29 Z_disc fate_specification 12 44 584
30 Z_disc sarcomere 166 220 430
Not_Count_2_6 p_value adjusted_p_value
1 542 4.031980e-06 3.669102e-04
2 512 1.510745e-04 1.374778e-02
3 529 2.256059e-05 2.053014e-03
4 555 1.554685e-04 1.414764e-02
5 548 3.749909e-05 3.412417e-03
6 357 2.261894e-04 2.058324e-02
7 538 2.235224e-04 2.034054e-02
8 546 1.283913e-04 1.168361e-02
9 547 4.819768e-06 4.385989e-04
10 347 1.803934e-05 1.641580e-03
11 506 2.585037e-05 2.352384e-03
12 495 3.524117e-05 3.206946e-03
13 510 7.019329e-05 6.387589e-03
14 334 4.336333e-05 3.946063e-03
15 501 7.010650e-05 6.379692e-03
16 358 3.674011e-04 3.343350e-02
17 523 2.792575e-06 2.541243e-04
18 533 2.937979e-04 2.673561e-02
19 536 1.939491e-04 1.764936e-02
20 528 1.395707e-05 1.270093e-03
21 344 5.971621e-05 5.434175e-03
22 519 3.894246e-05 3.543764e-03
23 351 5.261606e-04 4.788062e-02
24 359 2.177828e-04 1.981824e-02
25 539 6.695766e-07 6.093147e-05
26 556 2.916050e-04 2.653605e-02
27 353 4.930944e-05 4.487159e-03
28 537 1.392532e-04 1.267204e-02
29 529 6.591986e-06 5.998707e-04
30 353 1.455038e-04 1.324084e-02
#merge the high effect count data
High_data <- rbind(data_cat_1, data_cat_2_6)
#assign the case/control categories
High_data <- High_data %>%
mutate(Category_group = case_when(
Category == 1 ~ "1",
TRUE ~ "2-6"
))
# Hard-coded number of genes per module, used to scale per-individual
# high-effect variant counts by module size.
# NOTE(review): these counts duplicate the `genes_per_module` table computed
# earlier from the `modules` spreadsheet — verify the two stay in sync if the
# spreadsheet changes.
df_gene_counts <- data.frame(
Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis",
"Rasopathy", "SNS_PNS", "Z_disc", "cytokine",
"fate_specification", "lysosomal_storage", "mitochondria",
"nuclear_envelope", "sarcomere"),
Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
# scale the high effect date per module based on the genes in the module
High_data_scaled <- High_data %>%
left_join(df_gene_counts, by = "Module")
High_data_scaled <- High_data_scaled %>%
mutate(High_count_per_gene = High_count / Num_Genes)
#compute the means
averages_sem_scaled <- High_data_scaled %>%
group_by(Module, Category_group) %>%
summarize(
Mean = mean(High_count_per_gene),
SD = sd(High_count_per_gene),
N = n(),
SEM = SD / sqrt(N),
.groups = 'drop'
)
#run the t-tests to compare the modules per case or control (Category_group)
test_results <- High_data_scaled %>%
group_by(Module) %>%
do({
ttest <- t.test(High_count_per_gene ~ Category_group, data = .)
tidy(ttest)
}) %>%
ungroup()
# show t-test data
print(test_results)
# A tibble: 14 × 11
Module estimate estimate1 estimate2 statistic p.value parameter conf.low
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CICR -0.0145 0.0193 0.0338 -2.28 2.31e-2 920. -0.0270
2 DGC -0.00427 0.000532 0.00480 -3.07 2.24e-3 584. -0.00700
3 ICD -0.00785 0.00714 0.0150 -2.14 3.29e-2 865. -0.0151
4 Membrane_p… -0.00698 0.00101 0.00799 -3.90 1.09e-4 534. -0.0105
5 Proteostas… -0.00511 0.000532 0.00565 -3.06 2.32e-3 559. -0.00840
6 Rasopathy -0.00212 0.00800 0.0101 -1.51 1.32e-1 996. -0.00487
7 SNS_PNS -0.00561 0.00447 0.0101 -1.90 5.75e-2 616. -0.0114
8 Z_disc -0.00519 0.00140 0.00659 -3.24 1.28e-3 596. -0.00834
9 cytokine 0.0141 0.100 0.0860 1.73 8.40e-2 1005. -0.00190
10 fate_speci… -0.00848 0.000508 0.00898 -3.39 7.63e-4 519. -0.0134
11 lysosomal_… -0.00143 0.00186 0.00329 -0.788 4.31e-1 937. -0.00500
12 mitochondr… -0.00157 0.00112 0.00269 -2.11 3.52e-2 701. -0.00303
13 nuclear_en… -0.00296 0 0.00296 -1.90 5.77e-2 505 -0.00603
14 sarcomere -0.0414 0.0244 0.0658 -6.10 1.89e-9 582. -0.0547
# ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
#calculate averages
averages_sem_scaled <- High_data_scaled %>%
group_by(Module, Category_group) %>%
summarize(
Mean = mean(High_count_per_gene),
SD = sd(High_count_per_gene),
N = n(),
SEM = SD / sqrt(N),
.groups = 'drop'
)
# label function for the fill legend
custom_fill_labels <- c("1" = "Control", "2-6" = "Case")
# Plot
High_modules_plot_scaled <- ggplot(averages_sem_scaled, aes(x = Module, y = Mean, fill = Category_group, group = Category_group)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.8), color = "black") +
geom_errorbar(aes(ymin = Mean - SEM, ymax = Mean + SEM), width = .2, position = position_dodge(.8)) +
scale_fill_manual(values = c("1" = "#FF6464", "2-6" = "#05618F"), labels = custom_fill_labels) +
labs(x = "Module", y = "Average High Count Per Gene") +
theme_cowplot() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Show plot
print(High_modules_plot_scaled)
#compute the relative counts based on cohort size
comparison_df <- comparison_df %>%
mutate(Relative_Counts = (Count_2_6 / (sum(cohort$arrest_ontology %in% c(2,3,4,5,6))))/(Count_1 / (sum(cohort$arrest_ontology == "1"))))
#assign in the module gene count data
df_gene_counts <- data.frame(
Module = c("CICR", "DGC", "ICD", "Membrane_polarity", "Proteostasis",
"Rasopathy", "SNS_PNS", "Z_disc", "cytokine",
"fate_specification", "lysosomal_storage", "mitochondria",
"nuclear_envelope", "sarcomere"),
Num_Genes = c(11, 7, 12, 24, 7, 17, 10, 12, 4, 11, 3, 25, 4, 16)
)
significant_edges_df <- comparison_df %>%
filter(adjusted_p_value < 0.05)
# Create an igraph object with edge attributes using filtered dataframe
network_graph <- graph_from_data_frame(d = significant_edges_df[, c("Module1", "Module2")], directed = FALSE)
# Matching Num_Genes from df_gene_counts to vertices in the network_graph
V(network_graph)$Num_Genes <- df_gene_counts$Num_Genes[match(V(network_graph)$name, df_gene_counts$Module)]
# Add edge attributes for adjusted_p_value and Relative_Counts from the filtered data frame
E(network_graph)$adjusted_p_value <- significant_edges_df$adjusted_p_value
E(network_graph)$Relative_Counts <- significant_edges_df$Relative_Counts
# Plot the graph with ggraph
HIGH_network <- ggraph(network_graph, layout = 'fr') +
geom_edge_link(aes(color = -log10(adjusted_p_value), edge_width = Relative_Counts), alpha = 0.8) +
geom_node_point(aes(size = Num_Genes), color = "Black") +
geom_node_text(aes(label = name), repel = TRUE, point.padding = unit(0.01, "lines")) +
scale_size_continuous(range = c(3, 10)) +
theme_graph()
print(HIGH_network)
Warning: The `trans` argument of `continuous_scale()` is deprecated as of ggplot2 3.5.0.
ℹ Please use the `transform` argument instead.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
generated.
#Filter NAs and aggregate the data
module_data <- module_data %>%
filter(!is.na(Module))
module_data <- module_data %>%
mutate(Category_Group = ifelse(Category == 1, "Category 1", "Categories 2-6"))
#count up the variants
module_summary <- module_data %>%
group_by(Module, Category_Group) %>%
summarise(Total_HIGH_variant = sum(HIGH, na.rm = TRUE)) %>%
ungroup()
`summarise()` has grouped output by 'Module'. You can override using the
`.groups` argument.
# First, calculate the number of genes per module
genes_per_module <- modules %>%
group_by(Module) %>%
summarise(Num_Genes = n()) %>%
ungroup()
# Merge this information with your module_summary data frame
module_summary <- module_summary %>%
left_join(genes_per_module, by = c("Module" = "Module"))
# Calculate the size relative to the number of genes per module
module_summary <- module_summary %>%
mutate(Relative_Size = Total_HIGH_variant / Num_Genes)
# Filter the data to only include "Categories 2-6"
module_summary_filtered <- module_summary %>%
filter(Category_Group == "Categories 2-6")
#order them by mean count
module_order <- module_summary_filtered %>%
group_by(Module) %>%
summarise(Sort_Metric = mean(Total_HIGH_variant, na.rm = TRUE)) %>%
arrange(desc(Sort_Metric)) %>%
.$Module
module_summary_filtered$Module <- factor(module_summary_filtered$Module, levels = module_order)
# Now plot with the reordered Module
modules_HIGH_plot <- ggplot(module_summary_filtered, aes(x = Module, y = Category_Group, size = Total_HIGH_variant, color = Relative_Size)) +
geom_point(shape = 15, alpha = 1) +
scale_size(range = c(3, 20), name = "Number of HIGHs") +
scale_color_gradient(low = "Grey", high = "#05618F", name = "# of HIGHs relative to Module size") +
theme_minimal() +
labs(title = "Total HIGH by Module (Cases)",
x = "Module",
y = "") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
print(modules_HIGH_plot)
Pull in uniprot data and plot PLP variants
# pull down data for MYH7 (uniprot ID P12883)
MYH7_data <- drawProteins::get_features("P12883")
[1] "Download has worked"
MYH7_data <- drawProteins::feature_to_dataframe(MYH7_data)
# pull down data for MYBPC3 (uniprot ID Q14896)
MYBPC3_data <- drawProteins::get_features("Q14896")
[1] "Download has worked"
MYBPC3_data <- drawProteins::feature_to_dataframe(MYBPC3_data)
# pull down data for SCN5A (uniprot ID Q14524)
SCN5A_data <- drawProteins::get_features("Q14524")
[1] "Download has worked"
SCN5A_data <- drawProteins::feature_to_dataframe(SCN5A_data)
MYH7_plot <- draw_canvas(MYH7_data)
MYH7_plot <- draw_chains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_domains(MYH7_plot, MYH7_data)
MYH7_plot <- draw_regions(MYH7_plot, MYH7_data)
#MYH7_plot <- draw_phospho(MYH7_plot, MYH7_data)
variants <- data.frame(
begin = c(1712,1057,908,904,869,741,723,576,369,355),
y = rep(1, 10)
)
# Now, add these points to your plot
MYH7_plot <- MYH7_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes and aesthetics as needed
MYH7_plot <- MYH7_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
MYH7_plot
##################################################################################################
MYBPC3_plot <- draw_canvas(MYBPC3_data)
MYBPC3_plot <- draw_chains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_domains(MYBPC3_plot, MYBPC3_data)
MYBPC3_plot <- draw_regions(MYBPC3_plot, MYBPC3_data)
#MYBPC3_plot <- draw_phospho(MYBPC3_plot, MYBPC3_data)
#######Note, 7 variants are not shown because they are splice variants
variants <- data.frame(
begin = c(1098, 1096, 1064, 791, 542, 502, 495, 258, 219),
y = rep(1, 9)
)
# Now, add these points to your plot
MYBPC3_plot <- MYBPC3_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes and aesthetics as needed
MYBPC3_plot <- MYBPC3_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
MYBPC3_plot
##################################################################################################
SCN5A_plot <- draw_canvas(SCN5A_data)
SCN5A_plot <- draw_chains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_domains(SCN5A_plot, SCN5A_data)
SCN5A_plot <- draw_regions(SCN5A_plot, SCN5A_data)
#SCN5A_plot <- draw_phospho(SCN5A_plot, SCN5A_data)
#######Note, 2 variants are not shown because they are splice variants
variants <- data.frame(
begin = c(1784,1595,1319,1316,845,828,814),
y = rep(1, 7)
)
# Now, add these points to your plot
SCN5A_plot <- SCN5A_plot +
geom_point(data = variants, aes(x = begin, y = y), color = "red", size = 3)
# Adjust themes as needed
SCN5A_plot <- SCN5A_plot + theme_bw(base_size = 14) +
theme(panel.grid.minor = element_blank(),
panel.grid.major = element_blank(),
axis.ticks = element_blank(),
axis.text.y = element_blank(),
panel.border = element_blank())
SCN5A_plot
Noncoding******************************************************************************************************************
Pull in the file that is just the ATAC data intervals overlaid on top of the Hi-C intervals which map to the genes of interest
# Read the PC-Hi-C ATAC overlay BED file
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/SFRN_cohort_data/noncoding/noncoding_rare/EP_ATAC_intersect_final_output_nodups.bed", col_names = FALSE, col_types = cols(
X1 = col_character(), # Chromosome
X2 = col_double(), # Start Position
X3 = col_double() # End Position
))
# Calculate Interval Sizes
bed_data$interval_size <- bed_data$X3 - bed_data$X2
# Calculate the median of interval sizes
median_interval_size <- median(bed_data$interval_size, na.rm = TRUE)
# plot
interval_size_histogram <- ggplot(bed_data, aes(x = interval_size)) +
geom_histogram(binwidth = 100, fill = "lightblue", color = "black") +
geom_vline(aes(xintercept = median_interval_size), color = "black", linetype = "dashed", size = 1) +
xlab("Interval Size") +
ylab("Frequency") +
ggtitle("Histogram of Interval Sizes") +
theme_cowplot(12)
interval_size_histogram
mean(bed_data$interval_size)
[1] 497.4085
median(bed_data$interval_size)
[1] 396
Now pull in the ATAC data that maps to genes of interest, including the promoter fragment the ATAC data mapped to
# Read the BED file
bed_data <- read_tsv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/EP_ATAC_intersect_final_output_annotated_unique.bed", col_names = c("Chromosome", "Start", "End", "Chromosome2", "GeneStart", "GeneEnd", "GeneNames"), col_types = cols(
Chromosome = col_character(),
Start = col_double(),
End = col_double(),
Chromosome2 = col_character(),
GeneStart = col_double(),
GeneEnd = col_double(),
GeneNames = col_character()
))
# Split the gene names in case there are multiple genes separated by semicolon
bed_data <- bed_data %>%
mutate(GeneNames = strsplit(GeneNames, ";")) %>%
unnest(GeneNames)
# Read the gene list
gene_list <- readLines("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/Lorenzo_panel_genes_simple.txt")
# Filter bed_data to keep only rows with genes in the gene list
bed_data <- bed_data %>%
filter(GeneNames %in% gene_list)
# Count the number of regions mapped to each gene
gene_count <- bed_data %>%
count(GeneNames)
print(gene_count)
# A tibble: 347 × 2
GeneNames n
<chr> <int>
1 A2ML1 15
2 ABAT 45
3 ABCC9 35
4 ACADVL 3
5 ACTC1 9
6 ACTL6B 13
7 ACTN2 21
8 ADAM22 3
9 ADSL 35
10 AGL 59
# ℹ 337 more rows
# Calculate the median
median_regions <- median(gene_count$n, na.rm = TRUE)
# plot
regions_per_gene <- ggplot(gene_count, aes(x = n)) +
geom_histogram(binwidth = 1, fill = "#B40F20", color = "#B40F20") +
geom_vline(aes(xintercept = median_regions), color = "black", linetype = "dashed", size = 1) +
xlab("Number of Regions") +
ylab("Frequency") +
theme_cowplot(12) +
ggtitle("Histogram of the Distribution of Number of Regions per Gene")
regions_per_gene
mean(gene_count)
mean(gene_count$n)
[1] 31.40922
median(gene_count$n)
[1] 24
bed_data$region_size <- bed_data$End - bed_data$Start
# Sum region sizes for each gene
total_region_size_per_gene <- bed_data %>%
group_by(GeneNames) %>%
summarise(TotalRegionSize = sum(region_size))
mean(total_region_size_per_gene$TotalRegionSize)
[1] 16239.12
median(total_region_size_per_gene$TotalRegionSize)
[1] 12010
# Create histogram of Total Region Sizes
ggplot(total_region_size_per_gene, aes(x = TotalRegionSize)) +
geom_histogram(binwidth = 1000, fill = "blue", color = "black") +
xlab("Total Region Size") +
ylab("Frequency") +
ggtitle("Histogram of Total Region Sizes per Gene")
gene_list <- unique(bed_data$GeneNames)
# Select hg19 from Ensembl
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl", host = "grch37.ensembl.org")
Warning: Ensembl will soon enforce the use of https.
Ensure the 'host' argument includes "https://"
# Get TSS positions along with strand information
tss_info <- getBM(attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
filters = 'hgnc_symbol',
values = gene_list,
mart = ensembl)
# Make function to select the most upstream TSS based on strand
# For one gene's transcripts, pick the most upstream TSS: the minimum
# coordinate when every transcript is on the forward strand (strand == 1),
# otherwise the maximum coordinate (reverse strand).
# NOTE(review): a gene with transcripts on BOTH strands falls into the
# reverse-strand (max) branch — confirm that is the intended handling.
select_upstream_tss <- function(df) {
  all_forward <- all(df$strand == 1)
  tss <- if (all_forward) {
    min(df$transcription_start_site)
  } else {
    max(df$transcription_start_site)
  }
  data.frame(transcription_start_site = tss)
}
# Apply the function to each group of genes
tss_upstream <- tss_info %>%
group_by(hgnc_symbol) %>%
do(select_upstream_tss(.))
# Merge bed_data with tss_upstream on gene names
merged_data <- merge(bed_data, tss_upstream, by.x = "GeneNames", by.y = "hgnc_symbol")
# Calculate the midpoint (peak center) of each interval
merged_data$Center <- (merged_data$Start + merged_data$End) / 2
# Calculate the distance
merged_data$DistanceToTSS <- abs(merged_data$Center - merged_data$transcription_start_site)
Distance to TSS
mean(merged_data$DistanceToTSS)
[1] 264213.1
median(merged_data$DistanceToTSS)
[1] 176294
# Create a histogram
median_distance <- median(merged_data$DistanceToTSS, na.rm = TRUE)
dist_tss <- ggplot(merged_data, aes(x = DistanceToTSS)) +
geom_histogram(binwidth = 0.1, fill = "#FFB3BA", color = "black") +
geom_vline(aes(xintercept = median_distance), color = "black", linetype = "dashed", size = 1) +
xlab("Distance to TSS (bp)") +
ylab("Frequency") +
theme_cowplot(12) +
ggtitle("Histogram of Distances to Transcription Start Sites (TSS)") +
scale_x_log10(breaks = 10^(0:6) * 10000, labels = scales::comma)
dist_tss
# Retrieve TSS information for all genes along with HGNC symbols
all_tss_info <- getBM(
attributes = c('hgnc_symbol', 'chromosome_name', 'transcription_start_site', 'strand'),
mart = ensembl
)
# Filter out rows where hgnc_symbol is blank
filtered_all_tss_info <- all_tss_info %>%
filter(hgnc_symbol != "")
See if the peak is closer to another TSS than the contacted one!
# Function to select the most upstream TSS along with chromosome information based on strand, just like before, but for all genes
# Genome-wide variant of select_upstream_tss: also reports the chromosome
# of the chosen TSS. Empty groups yield a single all-NA row so the gene
# symbol is preserved in the grouped output.
# NOTE(review): mixed-strand groups take the reverse-strand (max) branch —
# confirm that is the intended handling.
select_upstream_tss <- function(df) {
  if (nrow(df) == 0) {
    return(data.frame(chromosome_name = NA, transcription_start_site = NA))
  }
  tss <- if (all(df$strand == 1)) {
    # Forward strand: most upstream is the smallest coordinate
    min(df$transcription_start_site, na.rm = TRUE)
  } else {
    # Reverse strand: most upstream is the largest coordinate
    max(df$transcription_start_site, na.rm = TRUE)
  }
  # Chromosome of the (first) transcript whose TSS equals the selected one
  chrom <- df$chromosome_name[which.min(abs(df$transcription_start_site - tss))]
  data.frame(chromosome_name = chrom, transcription_start_site = tss)
}
# Apply the function to each group of genes
all_tss_upstream <- filtered_all_tss_info %>%
group_by(hgnc_symbol) %>%
do(select_upstream_tss(.)) %>%
ungroup()
# Prepend 'chr' to chromosome names
all_tss_upstream$chromosome_name <- paste0("chr", all_tss_upstream$chromosome_name)
# Create GRanges object for all_tss_upstream
tss_ranges <- GRanges(
seqnames = all_tss_upstream$chromosome_name,
ranges = IRanges(start = all_tss_upstream$transcription_start_site, end = all_tss_upstream$transcription_start_site)
)
# Create GRanges object for merged_data
merged_data_ranges <- GRanges(
seqnames = merged_data$Chromosome,
ranges = IRanges(start = merged_data$Center, end = merged_data$Center)
)
# Find the nearest TSS for each midpoint (peak center)
nearest_hits <- nearest(merged_data_ranges, tss_ranges)
# Extract the closest TSS information
closest_tss_info <- all_tss_upstream[nearest_hits, ]
# Add closest TSS information to merged_data
merged_data$closest_gene <- closest_tss_info$hgnc_symbol
merged_data$closest_tss <- closest_tss_info$transcription_start_site
merged_data$closest_tss_chromosome <- closest_tss_info$chromosome_name
# Calculate distance to closest TSS
merged_data$distance_to_closest_tss <- abs(merged_data$Center - merged_data$closest_tss)
# Compare GeneNames with closest_gene and append
merged_data$gene_match <- merged_data$GeneNames == merged_data$closest_gene
# Calculate the fraction of matches
fraction_match <- sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data)
fraction_mismatch <- 1-(sum(merged_data$gene_match, na.rm = TRUE) / nrow(merged_data))
# Print the fraction
print(100*fraction_match)
[1] 13.26727
print(100*fraction_mismatch)
[1] 86.73273
Lets see which genes have regions mapped to them that are closer to other TSSs
# Aggregate data by GeneNames
gene_summary <- merged_data %>%
group_by(GeneNames) %>%
summarize(
any_true = any(gene_match, na.rm = TRUE),
any_false = any(!gene_match, na.rm = TRUE),
all_true = all(gene_match, na.rm = TRUE),
all_false = all(!gene_match, na.rm = TRUE)
) %>%
ungroup()
# Adjusted calculations to include the 16 genes with no ATAC peaks (denominator = genes with peaks + 16)
adjusted_denominator <- nrow(gene_summary) + 16
percent_any_true <- sum(gene_summary$any_true) / adjusted_denominator * 100
number_any_true <- sum(gene_summary$any_true)
percent_any_false <- sum(gene_summary$any_false) / adjusted_denominator * 100
number_any_false <- sum(gene_summary$any_false)
percent_all_true <- sum(gene_summary$all_true) / adjusted_denominator * 100
number_all_true <- sum(gene_summary$all_true)
percent_all_false <- sum(gene_summary$all_false) / adjusted_denominator * 100
number_all_false <- sum(gene_summary$all_false)
percent_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false) / adjusted_denominator * 100
number_both_true_false <- sum(gene_summary$any_true & gene_summary$any_false)
# Print the results
cat("Total genes in panel", 363 ,"\n")
Total genes in panel 363
cat("Genes with no ATAC peaks:", 100*(16/363),"%, ",16,"\n")
Genes with no ATAC peaks: 4.407713 %, 16
cat("Genes with at least one TRUE:", percent_any_true,"%, ", number_any_true,"\n")
Genes with at least one TRUE: 42.69972 %, 155
cat("Genes with at least one FALSE:", percent_any_false,"%, ", number_any_false,"\n")
Genes with at least one FALSE: 93.66391 %, 340
cat("Genes with only TRUE:", percent_all_true,"%, ", number_all_true,"\n")
Genes with only TRUE: 1.928375 %, 7
cat("Genes with only FALSE:", percent_all_false,"%, ", number_all_false,"\n")
Genes with only FALSE: 52.89256 %, 192
cat("Genes with both TRUE and FALSE:", percent_both_true_false,"%, ", number_both_true_false,"\n")
Genes with both TRUE and FALSE: 40.77135 %, 148
input the count data
# Per-individual noncoding variant frequency table (tab-delimited)
noncoding_stats <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/noncoding_frequency_by_individual_for_manuscript.txt", header = TRUE, sep = "\t")
Categorize the data
# Map arrest category codes to the two analysis groups:
# 1 -> "Control", 2-6 -> "Case"; any other code becomes NA (case_when default)
categorized_noncoding_stats <- noncoding_stats %>%
mutate(Category_Group = case_when(
Category == 1 ~ "Control",
#Category %in% 2:3 ~ "2-3",
Category %in% 2:6 ~ "Case"
))
Aggregate and clean
#clean it up to remove inadvertent NAs or infs
# NOTE(review): filter_all()/all_vars() is superseded in current dplyr
# (if_all() is the modern equivalent) but behaves the same here.
categorized_noncoding_stats <- categorized_noncoding_stats %>%
dplyr::select(-ID, -Category) %>%
na.omit() %>%
dplyr::filter_all(all_vars(!is.infinite(.)))
# Aggregate the Data to compute mean and standard deviation for each numeric column
collapsed_noncoding_stats <- categorized_noncoding_stats %>%
group_by(Category_Group) %>%
summarise(across(where(is.numeric), list(mean = ~mean(., na.rm = TRUE),
std = ~sd(., na.rm = TRUE))))
# Remove rows with NA values
collapsed_noncoding_stats <- na.omit(collapsed_noncoding_stats)
Perform two-sample t-tests (Levene's test selects pooled-variance vs Welch)
# Variables compared between cases and controls
noncoding_variables_to_analyze <- c("variant_count", "Epilepsy_gnomAD.0.001",
                                    "Epilepsy_ultrarare", "Cardiomyopathy_gnomAD.0.001",
                                    "Cardiomyopathy_ultrarare")
ttest_results <- list()
# For each variable: fit a one-way model to obtain residuals, use Levene's test
# to choose between the pooled-variance t-test and Welch's t-test, and store a
# tidy result row. Non-numeric columns are recorded as NA.
for (var in noncoding_variables_to_analyze) {
  if (!is.numeric(categorized_noncoding_stats[[var]])) {
    ttest_results[[var]] <- NA
    next
  }
  categorized_noncoding_stats$Category_Group <- as.factor(categorized_noncoding_stats$Category_Group)
  test_formula <- reformulate("Category_Group", response = var)
  # Fit the ANOVA model only to extract residuals for the constancy check
  model <- aov(test_formula, data = categorized_noncoding_stats)
  model_resid <- residuals(model)
  if (length(unique(model_resid)) > 1) {
    # Levene's test for homogeneity of variances decides pooled vs Welch
    levene_p <- leveneTest(test_formula, data = categorized_noncoding_stats)$`Pr(>F)`[1]
    use_pooled_variance <- levene_p > 0.05
  } else {
    # Constant residuals: default to Welch's t-test (matches original behavior)
    use_pooled_variance <- FALSE
  }
  ttest <- t.test(test_formula, data = categorized_noncoding_stats,
                  var.equal = use_pooled_variance)
  ttest_results[[var]] <- broom::tidy(ttest)
}
# Print t-test results
# (The "# A tibble" lines below are captured console output from a prior run.)
print("t-test Results:")
[1] "t-test Results:"
# One row per analyzed variable; estimate1/estimate2 are the group means
print(bind_rows(ttest_results, .id = "Variable"))
# A tibble: 5 × 11
Variable estimate estimate1 estimate2 statistic p.value parameter conf.low
<chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 variant_co… -108. 3759. 3867. -5.54 3.82e- 8 1041. -146.
2 Epilepsy_g… -1.52 46.2 47.7 -1.96 5.02e- 2 1041 -3.04
3 Epilepsy_u… 8.62 20.5 11.9 5.15 3.75e- 7 520. 5.33
4 Cardiomyop… -4.73 38.3 43.0 -6.79 1.90e-11 1041 -6.10
5 Cardiomyop… 7.00 16.6 9.64 5.80 1.11e- 8 529. 4.63
# ℹ 3 more variables: conf.high <dbl>, method <chr>, alternative <chr>
Create the Function to plot each column as a ratio to the means from Group 1
# Summarise one column as per-group means expressed as a ratio to the
# Control-group mean, with SEM-based normal-approximation 95% CI bounds.
# Returns list(summary_data = <tibble/data.frame of per-group stats>).
plot_column_ratio <- function(data, col_name) {
  # Reference statistics from the Control group
  control_values <- data[data$Category_Group == "Control", col_name]
  group1_mean <- mean(control_values, na.rm = TRUE)
  group1_sd <- sd(control_values, na.rm = TRUE)  # computed for parity; not used downstream
  # Per-group mean / SD / n, then ratios and CI relative to the Control mean
  data_summary <- data %>%
    group_by(Category_Group) %>%
    summarise(mean_value = mean(.data[[col_name]], na.rm = TRUE),
              sd_value = sd(.data[[col_name]], na.rm = TRUE),
              n = n()) %>%
    mutate(ratio = mean_value / group1_mean,
           sem_value = sd_value / sqrt(n),          # standard error of the mean
           sem_ratio = sem_value / group1_mean,     # SEM on the ratio scale
           ci_lower = ratio - 1.96 * sem_ratio,     # lower 95% bound
           ci_upper = ratio + 1.96 * sem_ratio)     # upper 95% bound
  list(summary_data = data_summary)
}
# Columns to summarise: everything except identifiers and the grouping column
columns_to_plot <- setdiff(names(categorized_noncoding_stats), c("ID", "Category", "Category_Group"))
# Accumulator for the per-column ratio summaries
combined_noncoding_stats_summary_df <- data.frame(
  Category_Group = character(), col_name = character(),
  mean = numeric(), std = numeric(), stringsAsFactors = FALSE
)
# Summarise every column relative to the Control group and stack the results
for (col in columns_to_plot) {
  column_summary <- plot_column_ratio(categorized_noncoding_stats, col)$summary_data
  column_summary <- mutate(column_summary, col_name = col)
  combined_noncoding_stats_summary_df <- bind_rows(combined_noncoding_stats_summary_df, column_summary)
}
# Keep only the case group, restricted to the variant classes shown in the figure
selected_noncoding_data <- combined_noncoding_stats_summary_df %>%
  filter(
    col_name %in% c("variant_count",
                    "Cardiomyopathy_gnomAD.0.001",
                    "Epilepsy_gnomAD.0.001",
                    "Cardiomyopathy_ultrarare",
                    "Epilepsy_ultrarare"),
    Category_Group != "Control"
  )
# Fix the display order of the variant classes on the y-axis
levels_order <- c(
  "variant_count",
  "Epilepsy_ultrarare",
  "Epilepsy_gnomAD.0.001",
  "Cardiomyopathy_ultrarare",
  "Cardiomyopathy_gnomAD.0.001"
)
selected_noncoding_data <- selected_noncoding_data %>%
  mutate(col_name = factor(col_name, levels = levels_order))
# Plot
noncoding_stats_plot <- ggplot(selected_noncoding_data, aes(y = col_name, x = ratio, color = Category_Group)) +
geom_point(position = position_dodge(width = 1), size = 3) +
geom_errorbar(aes(xmin = ratio - sem_ratio, xmax = ratio + sem_ratio), position = position_dodge(width = 0.5), width = 0.5) +
geom_vline(xintercept = 1, linetype = "dashed") +
scale_color_manual(values = "#FF6464") +
labs(title = "Ratio Compared to Group 1 +/-SEM",
y = "Variant",
x = "Ratio to Group 1 Mean",
color = "Category Group") +
theme_minimal() +
theme(axis.text.x = element_text(size = 8),
axis.text.y = element_text(size = 8, hjust = 1),
axis.title.x = element_text(size = 8),
axis.title.y = element_text(size = 8),
legend.title = element_text(size = 8),
legend.text = element_text(size = 8),
plot.title = element_text(size = 8, hjust = 0.5),
plot.subtitle = element_text(size = 8),
plot.caption = element_text(size = 8),
plot.margin = margin(15, 15, 15, 15)) +
scale_x_continuous(limits = c(0.75, 2))
print(noncoding_stats_plot)
Input the data (change to your path)
# Pull in the gene data (per-individual, per-gene noncoding variant counts)
gene_data_noncoding <- read.csv("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/individual_noncoding_variants_by_gene_for_manuscript.csv")
# Read the cohort data
cohort <- read.table("/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/SFRN_cohort_for_manuscript.txt", sep = "\t", header = TRUE)
# add the arrest category to each
# Lookup by CGM id: rows with no matching cohort entry get NA Category
gene_data_noncoding$Category <- cohort$arrest_ontology[match(gene_data_noncoding$ID, cohort$CGM_id)]
#filter for discovery only
gene_data_noncoding_discovery <- gene_data_noncoding %>%
filter(Set == "Discovery")
Summarize the data by GENE, counting the number of variants for each gene. Filter for Category > 1, then group and summarize
# Per-gene ultrarare noncoding variant totals among cases (Category 2-6)
variants_per_gene_Cat_2_6 <- gene_data_noncoding_discovery %>%
filter(Category > 1) %>%
group_by(GENE) %>%
summarise(
noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
# (The "# A tibble" block below is captured console output.)
print(variants_per_gene_Cat_2_6)
# A tibble: 424 × 2
GENE noncoding_ultrarare
<chr> <int>
1 A2ML1 99
2 AAMP 10
3 ABAT 68
4 ABCC9 0
5 AC011551.3 0
6 ACADVL 28
7 ACTC1 31
8 ACTL6B 183
9 ACTN2 28
10 ADAM22 0
# ℹ 414 more rows
Filter for Category == 1 and then group and summarize
# Per-gene ultrarare noncoding variant totals among controls (Category 1)
variants_per_gene_Cat_1 <- gene_data_noncoding_discovery %>%
filter(Category == 1) %>%
group_by(GENE) %>%
summarise(
noncoding_ultrarare = sum(noncoding_ultrarare, na.rm = TRUE),
.groups = 'drop'
)
# Print the result
print(variants_per_gene_Cat_1)
# A tibble: 424 × 2
GENE noncoding_ultrarare
<chr> <int>
1 A2ML1 37
2 AAMP 1
3 ABAT 50
4 ABCC9 0
5 AC011551.3 0
6 ACADVL 9
7 ACTC1 7
8 ACTL6B 130
9 ACTN2 16
10 ADAM22 0
# ℹ 414 more rows
Append panel source to the gene_data
# NOTE(review): `all_genes` (the gene -> panel Source lookup table) is defined
# earlier in the file, outside this section -- confirm it is loaded before here.
# Ensure that the 'GENE' column in 'gene_data_noncoding_discovery' and 'Gene' in 'all_genes' are character type
gene_data_noncoding_discovery$GENE <- as.character(gene_data_noncoding_discovery$GENE)
all_genes$Gene <- as.character(all_genes$Gene)
# Find the index of each gene in 'all_genes'
# Then, use this index to assign the corresponding 'Source' to a new column in 'gene_data'
# Genes absent from the lookup get NA Source (see RP11-156P1.2 in the output below)
gene_data_noncoding_discovery$Source <- all_genes$Source[match(gene_data_noncoding_discovery$GENE, all_genes$Gene)]
# Now, 'gene_data' will have a new column 'Source' with the source of each gene
# based on the lookup from 'all_genes'
# View the first few rows to verify the new 'Source' has been added
head(gene_data_noncoding_discovery)
ID GENE noncoding_ultrarare Set Category Source
1 CGM0000969 RANGRF 0 Discovery 2 Cardiomyopathy
2 CGM0000969 AGL 0 Discovery 2 Cardiomyopathy
3 CGM0000969 RNF13 0 Discovery 2 Epilepsy
4 CGM0000969 PRKAG2 0 Discovery 2 Cardiomyopathy
5 CGM0000969 JPH2 1 Discovery 2 Cardiomyopathy
6 CGM0000969 RP11-156P1.2 0 Discovery 2 <NA>
Add the source to the category 1 variants list
# Same Source lookup applied to the per-gene control counts
variants_per_gene_Cat_1$Source <- all_genes$Source[match(variants_per_gene_Cat_1$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_1)
# A tibble: 6 × 3
GENE noncoding_ultrarare Source
<chr> <int> <chr>
1 A2ML1 37 Cardiomyopathy
2 AAMP 1 <NA>
3 ABAT 50 Epilepsy
4 ABCC9 0 Cardiomyopathy
5 AC011551.3 0 <NA>
6 ACADVL 9 Cardiomyopathy
Add the source to the category 2-6 variants list
# Same Source lookup applied to the per-gene case counts
variants_per_gene_Cat_2_6$Source <- all_genes$Source[match(variants_per_gene_Cat_2_6$GENE, all_genes$Gene)]
head(variants_per_gene_Cat_2_6)
# A tibble: 6 × 3
GENE noncoding_ultrarare Source
<chr> <int> <chr>
1 A2ML1 99 Cardiomyopathy
2 AAMP 10 <NA>
3 ABAT 68 Epilepsy
4 ABCC9 0 Cardiomyopathy
5 AC011551.3 0 <NA>
6 ACADVL 28 Cardiomyopathy
combine the variants together now
# Add a new column to indicate the category
variants_per_gene_Cat_1 <- variants_per_gene_Cat_1 %>%
mutate(Category = "1")
# Add a new column to indicate the category range
variants_per_gene_Cat_2_6 <- variants_per_gene_Cat_2_6 %>%
mutate(Category = "2-6")
# Combine the two datasets
# Stacks control rows ("1") above case rows ("2-6"); 424 genes x 2 = 848 rows
combined_variants <- bind_rows(variants_per_gene_Cat_1, variants_per_gene_Cat_2_6)
# Print the combined dataset
print(combined_variants)
# A tibble: 848 × 4
GENE noncoding_ultrarare Source Category
<chr> <int> <chr> <chr>
1 A2ML1 37 Cardiomyopathy 1
2 AAMP 1 <NA> 1
3 ABAT 50 Epilepsy 1
4 ABCC9 0 Cardiomyopathy 1
5 AC011551.3 0 <NA> 1
6 ACADVL 9 Cardiomyopathy 1
7 ACTC1 7 Cardiomyopathy 1
8 ACTL6B 130 Epilepsy 1
9 ACTN2 16 Cardiomyopathy 1
10 ADAM22 0 Epilepsy 1
# ℹ 838 more rows
# Keep genes with at least one ultrarare noncoding variant and a known panel source
noncoding_ultrarare_data <- combined_variants %>%
  filter(noncoding_ultrarare > 0, !is.na(Source))
# Label function for the noncoding_ultrarare plot: for the "Category" variable,
# map code "1" to "Control" and anything else to "Case"; all other variables'
# values pass through unchanged. Vectorized over `value`.
custom_labeller_noncoding <- function(variable, value) {
  if (variable != "Category") {
    return(value)
  }
  ifelse(value == "1", "Control", "Case")
}
# Create the noncoding_ultrarare plot: gene x category heatmap of ultrarare
# noncoding variant counts, faceted by panel Source (one row per panel).
noncoding_ultrarare_plot <- ggplot(noncoding_ultrarare_data, aes(x = GENE, y = Category, fill = noncoding_ultrarare)) +
geom_tile() +
# Three-color gradient anchored at the low / middle / high ends of the fill range
scale_fill_gradientn(colors = c("#3C8C78", "#dcb43c", "#ae7e46"),
values = scales::rescale(c(0, 0.5, 1))) +
# free_x so each panel shows only its own genes
facet_wrap(~ Source, scales = "free_x", ncol = 1) +
labs(title = "Noncoding Ultrarare Heatmap",
x = "Gene",
y = "Category",
fill = "Count") +
theme_cowplot(12) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
strip.background = element_blank(),
strip.placement = "outside") +
# Relabel category codes on the y-axis ("1" -> "Control", "2-6" -> "Case")
scale_y_discrete(labels = function(x) custom_labeller_noncoding("Category", x))
# Print the plot
print(noncoding_ultrarare_plot)
Combined******************************************************************************************************************
# Read the cohort data (combined per-individual dataset for all analyses below)
total_data <- read.table('/Users/tmt6052/Library/CloudStorage/OneDrive-NorthwesternUniversity/SFRN_paper_items/R_analysis/final_for_paper/combined_data_for_manuscript.txt', header = TRUE, sep = '\t', stringsAsFactors = FALSE)
# Split into the model-training (Discovery) and held-out (Replication) sets
combined_data <- total_data %>% filter(Set =="Discovery")
replication_data <- total_data %>% filter(Set =="Replication")
Categorize the Data based on original categories and arrest status
#Clump by category
# Case/control assignment from the arrest grouping, applied identically to the
# discovery and replication cohorts (previously duplicated inline).
# "zCase" (rather than "Case") keeps cases sorted after "Control" in
# factor-level order; per the original note this is purely for formatting.
# Individuals with an unrecognized arrest_group (NA Category) are dropped.
categorize_arrest_groups <- function(df) {
  df <- df %>%
    mutate(Category = case_when(
      arrest_group == 1 ~ "Control",
      arrest_group %in% 2:6 ~ "zCase"
    )) %>%
    filter(!is.na(Category))
  # Convert Category to a factor (levels: "Control" < "zCase")
  df$Category <- as.factor(df$Category)
  df
}
# Discovery cohort
categorized_combined_data <- categorize_arrest_groups(combined_data)
# Replication cohort: identical categorization
categorized_replication_data <- categorize_arrest_groups(replication_data)
Collapse the rare vars
# Collapse per-frequency variant counts into the aggregate classes used by the
# regression models below. Applied identically to the discovery and replication
# cohorts (previously duplicated as parallel chains of mutate() calls).
# All sums are over independent input columns, so a single mutate() reproduces
# the original sequence of mutates exactly, including column order.
collapse_rare_variant_counts <- function(df) {
  df %>%
    mutate(
      # Coding missense: "rare" = MAF 0.01 + 0.001 bins; "ultrarare" = 1e-5 + singletons
      Epilepsy_rare = Epilepsy_missense_variant_0.01 + Epilepsy_missense_variant_0.001,
      Epilepsy_ultrarare = Epilepsy_missense_variant_0.00001 + Epilepsy_missense_singleton,
      # Null alleles: start-lost + stop-gained
      Epilepsy_null = Epilepsy_start_lost + Epilepsy_stop_gained,
      Cardiomyopathy_rare = Cardiomyopathy_missense_variant_0.01 + Cardiomyopathy_missense_variant_0.001,
      Cardiomyopathy_ultrarare = Cardiomyopathy_missense_variant_0.00001 + Cardiomyopathy_missense_singleton,
      Cardiomyopathy_null = Cardiomyopathy_start_lost + Cardiomyopathy_stop_gained,
      # Noncoding rare: gnomAD-rare + cohort-rare + cohort singletons
      Epilepsy_noncoding_rare = Epilepsy_noncoding_gnomad_0.001 + Epilepsy_noncoding_cohort_0.01 + Epilepsy_noncoding_cohort_singleton,
      Cardiomyopathy_noncoding_rare = Cardiomyopathy_noncoding_gnomad_0.001 + Cardiomyopathy_noncoding_cohort_0.01 + Cardiomyopathy_noncoding_cohort_singleton
    )
}
categorized_combined_data <- collapse_rare_variant_counts(categorized_combined_data)
categorized_replication_data <- collapse_rare_variant_counts(categorized_replication_data)
Perform multivariate Logistic Regression on everything
# Full multivariable logistic regression: case/control status on normalized
# ECG/imaging traits plus coding, noncoding, and P/LP variant burden counts.
model <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, family = binomial())
# (The model printout below is captured console output from a prior run.)
model
Call: glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval +
Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare +
Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare +
PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare +
Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
family = binomial(), data = categorized_combined_data)
Coefficients:
(Intercept) Normalized_Heart_rate
-0.685379 -0.081183
Normalized_PR_interval Normalized_QTc
0.080199 -0.057222
Normalized_BP Normalized_QRS
0.280711 0.405413
Normalized_LVEF Normalized_LVESV
-0.183223 0.100668
Normalized_SVI Normalized_Afib
0.016921 -0.110002
Epilepsy_rare Epilepsy_ultrarare
-0.041446 -0.011901
Cardiomyopathy_rare Cardiomyopathy_ultrarare
-0.025813 -0.006447
PLP_Epilepsy PLP_Cardiomyopathy
-0.223524 1.218792
Cardiomyopathy_noncoding_rare Epilepsy_noncoding_rare
-0.010962 0.026840
Cardiomyopathy_null Epilepsy_null
0.921670 0.425896
Degrees of Freedom: 1042 Total (i.e. Null); 1023 Residual
Null Deviance: 1445
Residual Deviance: 1235 AIC: 1275
Compute the scores
# tell it how to make the scores
# predict(type = "link") returns the linear predictor (log-odds scale)
scores <- predict(model, type = "link")
# Apply the discovery-trained model to the held-out replication individuals
scores_replication <- predict(model, newdata = categorized_replication_data, type = "link")
# Add scores to the dataframes
categorized_combined_data$scores <- scores
categorized_replication_data$scores_replication <- scores_replication
Perform multivariate logistic regressions based only on the input parameters, and subsets
# Component logistic models fit on subsets of the predictors, each trained on
# the discovery set; used below to score individuals and compare model classes.
# Cardiomyopathy coding burden only
model_cardiomyopathy_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null, data = categorized_combined_data, family = binomial())
# Epilepsy coding burden only
model_epilepsy_rare <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null, data = categorized_combined_data, family = binomial())
# Noncoding burden only
model_noncoding_rare <- glm(Category ~ Epilepsy_noncoding_rare + Cardiomyopathy_noncoding_rare, data = categorized_combined_data, family = binomial())
# Common-variant (normalized trait PRS) predictors only
model_common <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib,
data = categorized_combined_data, family = binomial())
# All coding burden (both panels)
model_coding_rare <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_null + Epilepsy_null, data = categorized_combined_data, family = binomial())
# Epilepsy coding burden + common variants
model_epilepsy_and_common <- glm(Category ~ PLP_Epilepsy + Epilepsy_rare + Epilepsy_ultrarare + Epilepsy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib, data = categorized_combined_data, family = binomial())
# Cardiomyopathy coding burden + common variants
model_cardiac_and_common <- glm(Category ~ PLP_Cardiomyopathy + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + Cardiomyopathy_null + Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib, data = categorized_combined_data, family = binomial())
# Common variants + all coding burden (everything except noncoding)
model_common_and_coding <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_null + Epilepsy_null,
data = categorized_combined_data, family = binomial())
Compute the rest of the scores and determine rank-based quintiles for all scores
# Linear-predictor (log-odds) score from each component model, discovery set
categorized_combined_data$cardiac_variant_score <- predict(model_cardiomyopathy_rare, type = "link")
categorized_combined_data$epilepsy_variant_score <- predict(model_epilepsy_rare, type = "link")
categorized_combined_data$noncoding_variant_score <- predict(model_noncoding_rare, type = "link")
categorized_combined_data$common_variant_score <- predict(model_common, type = "link")
categorized_combined_data$common_and_coding_score <- predict(model_common_and_coding, type = "link")
categorized_combined_data$coding_rare_score <- predict(model_coding_rare, type = "link")
categorized_combined_data$epilepsy_and_common_score <- predict(model_epilepsy_and_common, type = "link")
categorized_combined_data$cardiac_and_common_score <- predict(model_cardiac_and_common, type = "link")
# Quintile assignment: rank each score (ties broken by first occurrence) and
# map ranks 1..n onto quintiles 1..5 via ceiling(rank / (n/5))
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank = rank(scores, ties.method = "first"),
score_quintiles = ceiling(rank / (n() / 5)),
rank_cardiac = rank(cardiac_variant_score, ties.method = "first"),
cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
rank_epilepsy = rank(epilepsy_variant_score, ties.method = "first"),
epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
rank_noncoding = rank(noncoding_variant_score, ties.method = "first"),
noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
rank_common = rank(common_variant_score, ties.method = "first"),
common_score_quintiles = ceiling(rank_common / (n() / 5)),
rank_common_and_coding = rank(common_and_coding_score, ties.method = "first"),
common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
rank_model_coding_rare = rank(coding_rare_score, ties.method = "first"),
coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
rank_epilepsy_and_common = rank(epilepsy_and_common_score, ties.method = "first"),
epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
rank_cardiac_and_common = rank(cardiac_and_common_score, ties.method = "first"),
cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
)
# Convert each log-odds score to a predicted probability (inverse logit)
categorized_combined_data <- categorized_combined_data %>%
mutate(
probability = plogis(scores),
cardiac_probability = plogis(cardiac_variant_score),
epilepsy_probability = plogis(epilepsy_variant_score),
noncoding_probability = plogis(noncoding_variant_score),
common_probability = plogis(common_variant_score),
common_and_coding_probability = plogis(common_and_coding_score),
coding_rare_probability = plogis(coding_rare_score),
epilepsy_and_common_probability = plogis(epilepsy_and_common_score),
cardiac_and_common_probability = plogis(cardiac_and_common_score),
)
# Compute quintile-wise odds ratios of case vs control status relative to the
# lowest quintile, with Woolf 95% confidence intervals.
#
# Arguments:
#   data            - per-individual data frame
#   category_column - unquoted column holding "Control" / "zCase" labels
#   quintile_column - unquoted column holding the quintile assignment (1..5)
# Returns one row per quintile with counts, odds, odds_ratio, and CI bounds.
calculate_odds_ratios <- function(data, category_column, quintile_column) {
  # Count controls and cases within each quintile and form the within-quintile odds
  odds_data <- data %>%
    dplyr::group_by({{ quintile_column }}) %>%
    dplyr::summarise(
      count_category_1 = sum({{ category_column }} == "Control", na.rm = TRUE),
      count_category_2_6 = sum({{ category_column }} == "zCase", na.rm = TRUE),
      .groups = 'drop'
    ) %>%
    dplyr::mutate(
      odds = count_category_2_6 / count_category_1
    )
  # The first (lowest-score) quintile is the reference
  first_quintile_odds <- odds_data$odds[1]
  ref_controls <- odds_data$count_category_1[1]
  ref_cases <- odds_data$count_category_2_6[1]
  # FIX: the Woolf SE of log(OR) for a 2x2 comparison uses all four cell
  # counts -- the quintile's counts AND the reference quintile's counts.
  # The previous version omitted the reference terms, understating the CI width.
  odds_data <- odds_data %>%
    dplyr::mutate(
      odds_ratio = odds / first_quintile_odds,
      se_log_odds_ratio = sqrt(1 / count_category_1 + 1 / count_category_2_6 +
                                 1 / ref_controls + 1 / ref_cases),
      lower_ci = exp(log(odds_ratio) - 1.96 * se_log_odds_ratio),
      upper_ci = exp(log(odds_ratio) + 1.96 * se_log_odds_ratio)
    )
  return(odds_data)
}
# Apply function to each odds category
# One quintile OR table per score; all use the discovery-set quintiles.
# NOTE(review): "commmon" below is misspelled (triple m) but the replication
# block reuses the same name, so it is kept as-is for consistency.
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
cardiac_odds_summary = calculate_odds_ratios(categorized_combined_data,Category, cardiac_score_quintiles)
epilepsy_summary = calculate_odds_ratios(categorized_combined_data,Category, epilepsy_score_quintiles)
common_summary = calculate_odds_ratios(categorized_combined_data,Category, common_score_quintiles)
noncoding_summary = calculate_odds_ratios(categorized_combined_data,Category, noncoding_score_quintiles)
common_and_coding_summary = calculate_odds_ratios(categorized_combined_data,Category, common_and_coding_score_quintiles)
coding_rare_summary = calculate_odds_ratios(categorized_combined_data,Category, coding_rare_score_quintiles)
commmon_and_epilepsy_summary = calculate_odds_ratios(categorized_combined_data,Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary = calculate_odds_ratios(categorized_combined_data,Category, cardiac_and_common_score_quintiles)
################################################################################################################ replication
# Score the held-out replication individuals with the discovery-trained models
# (newdata = replication set; type = "link" gives the log-odds linear predictor)
categorized_replication_data$cardiac_variant_score_replication <- predict(model_cardiomyopathy_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$epilepsy_variant_score_replication <- predict(model_epilepsy_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$noncoding_variant_score_replication <- predict(model_noncoding_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$common_variant_score_replication <- predict(model_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data$common_and_coding_score_replication <- predict(model_common_and_coding, newdata = categorized_replication_data, type = "link")
categorized_replication_data$coding_rare_score_replication <- predict(model_coding_rare, newdata = categorized_replication_data, type = "link")
categorized_replication_data$epilepsy_and_common_score_replication <- predict(model_epilepsy_and_common, newdata = categorized_replication_data, type = "link")
categorized_replication_data$cardiac_and_common_score_replication <- predict(model_cardiac_and_common, newdata = categorized_replication_data, type = "link")
# Rank-based quintiles within the replication set (same scheme as discovery)
categorized_replication_data <- categorized_replication_data %>%
mutate(
rank = rank(scores_replication, ties.method = "first"),
score_quintiles = ceiling(rank / (n() / 5)),
rank_cardiac = rank(cardiac_variant_score_replication, ties.method = "first"),
cardiac_score_quintiles = ceiling(rank_cardiac / (n() / 5)),
rank_epilepsy = rank(epilepsy_variant_score_replication, ties.method = "first"),
epilepsy_score_quintiles = ceiling(rank_epilepsy / (n() / 5)),
rank_noncoding = rank(noncoding_variant_score_replication, ties.method = "first"),
noncoding_score_quintiles = ceiling(rank_noncoding / (n() / 5)),
rank_common = rank(common_variant_score_replication, ties.method = "first"),
common_score_quintiles = ceiling(rank_common / (n() / 5)),
rank_common_and_coding = rank(common_and_coding_score_replication, ties.method = "first"),
common_and_coding_score_quintiles = ceiling(rank_common_and_coding / (n() / 5)),
rank_model_coding_rare = rank(coding_rare_score_replication, ties.method = "first"),
coding_rare_score_quintiles = ceiling(rank_model_coding_rare / (n() / 5)),
rank_epilepsy_and_common = rank(epilepsy_and_common_score_replication, ties.method = "first"),
epilepsy_and_common_score_quintiles = ceiling(rank_epilepsy_and_common / (n() / 5)),
rank_cardiac_and_common = rank(cardiac_and_common_score_replication, ties.method = "first"),
cardiac_and_common_score_quintiles = ceiling(rank_cardiac_and_common / (n() / 5))
)
# Predicted probabilities via inverse logit, as in the discovery set
categorized_replication_data <- categorized_replication_data %>%
mutate(
probability = plogis(scores_replication),
cardiac_probability = plogis(cardiac_variant_score_replication),
epilepsy_probability = plogis(epilepsy_variant_score_replication),
noncoding_probability = plogis(noncoding_variant_score_replication),
common_probability = plogis(common_variant_score_replication),
common_and_coding_probability = plogis(common_and_coding_score_replication),
coding_rare_probability = plogis(coding_rare_score_replication),
epilepsy_and_common_probability = plogis(epilepsy_and_common_score_replication),
cardiac_and_common_probability = plogis(cardiac_and_common_score_replication),
)
# Apply function to each odds category
# Replication-set quintile OR tables, mirroring the discovery summaries.
# NOTE(review): "commmon" spelling is kept to match the discovery variable name.
combined_odds_summary_replication = calculate_odds_ratios(categorized_replication_data, Category, score_quintiles)
cardiac_odds_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, cardiac_score_quintiles)
epilepsy_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, epilepsy_score_quintiles)
common_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, common_score_quintiles)
noncoding_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, noncoding_score_quintiles)
common_and_coding_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, common_and_coding_score_quintiles)
coding_rare_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, coding_rare_score_quintiles)
commmon_and_epilepsy_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, epilepsy_and_common_score_quintiles)
cardiac_and_common_summary_replication = calculate_odds_ratios(categorized_replication_data,Category, cardiac_and_common_score_quintiles)
Plot discovery-set odds ratios by score quintile
# Helper: overlay one model's quintile odds-ratio series on the open
# base-graphics plot -- connecting line, filled points, and 95% CI whiskers,
# all on the log2 scale. Replaces seven near-identical inline stanzas.
add_or_series <- function(summary_df, color) {
  x <- seq_along(summary_df$odds_ratio)
  lines(x, log2(summary_df$odds_ratio), col = color)
  points(x, log2(summary_df$odds_ratio), pch = 19, col = color)
  arrows(x0 = x,
         y0 = log2(summary_df$lower_ci),
         y1 = log2(summary_df$upper_ci),
         code = 3, angle = 90, length = 0.05, col = color)
}
# y-limits span every plotted series' confidence interval
or_ci_lower <- c(combined_odds_summary$lower_ci, cardiac_odds_summary$lower_ci,
                 epilepsy_summary$lower_ci, common_summary$lower_ci,
                 noncoding_summary$lower_ci, common_and_coding_summary$lower_ci,
                 coding_rare_summary$lower_ci)
or_ci_upper <- c(combined_odds_summary$upper_ci, cardiac_odds_summary$upper_ci,
                 epilepsy_summary$upper_ci, common_summary$upper_ci,
                 noncoding_summary$upper_ci, common_and_coding_summary$upper_ci,
                 coding_rare_summary$upper_ci)
# Base plot (combined-model points); the x-axis is drawn manually below
plot(log2(combined_odds_summary$odds_ratio),
     ylim = c(log2(min(or_ci_lower)), log2(max(or_ci_upper))),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")
# One line/points/CI series per model
add_or_series(combined_odds_summary, "#3C8C78")    # Combined (full model)
add_or_series(cardiac_odds_summary, "#2E86C1")     # Cardiac coding
add_or_series(epilepsy_summary, "#A93226")         # Epilepsy coding
add_or_series(common_summary, "#229954")           # Common variants
add_or_series(noncoding_summary, "#D68910")        # Noncoding
add_or_series(common_and_coding_summary, "Black")  # Common + coding
add_or_series(coding_rare_summary, "Pink")         # All coding rare
# Add x-axis labels for quintiles
axis(1, at = seq_along(combined_odds_summary$odds_ratio), labels = TRUE)
# legend
legend("topleft", legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding", "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"), pch = 19, cex = 0.8)
plot replication
# Same figure for the replication cohort: base plot for the combined model,
# then overlay each component model's quintile odds-ratio series.
plot(log2(combined_odds_summary_replication$odds_ratio),
     ylim = c(-1,10),
     pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
     main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
     xaxt = "n", col = "#3C8C78")
# Helper (redefined so this chunk is self-contained): overlay one model's
# series — line, points, and 95% CI error bars. `summary_df` must have
# columns odds_ratio, lower_ci, upper_ci (one row per quintile).
add_or_series <- function(summary_df, color) {
  n_quintiles <- length(summary_df$odds_ratio)
  lines(seq_len(n_quintiles), log2(summary_df$odds_ratio), col = color)
  points(log2(summary_df$odds_ratio), pch = 19, col = color)
  arrows(x0 = seq_len(n_quintiles),
         y0 = log2(summary_df$lower_ci),
         y1 = log2(summary_df$upper_ci),
         code = 3, angle = 90, length = 0.05, col = color)
  invisible(NULL)
}
add_or_series(combined_odds_summary_replication, "#3C8C78")    # Combined
add_or_series(cardiac_odds_summary_replication, "#2E86C1")     # Cardiac
add_or_series(epilepsy_summary_replication, "#A93226")         # Epilepsy
add_or_series(common_summary_replication, "#229954")           # Common
add_or_series(noncoding_summary_replication, "#D68910")        # Noncoding
add_or_series(common_and_coding_summary_replication, "Black")  # Common + coding
add_or_series(coding_rare_summary_replication, "Pink")         # Rare coding
# Add x-axis labels for quintiles
axis(1, at = seq_len(length(combined_odds_summary_replication$odds_ratio)), labels = TRUE)
# legend
legend("topleft", legend = c("Combined", "Cardiac", "Epilepsy", "Common", "Noncoding", "common_and_coding", "model_coding_rare"),
       col = c("#3C8C78", "#2E86C1", "#A93226", "#229954", "#D68910", "Black", "Pink"), pch = 19, cex = 0.8)
Z test for odds significance
# Compare two sets of quintile odds ratios with a two-sided Z-test on the
# difference in log odds.
#
# Args:
#   odds1, odds2: data frames, one row per quintile, each with columns
#     `odds_ratio` and `se_log_odds_ratio`.
# Returns:
#   data.frame with `quintile` (1..n) and `p_values` (two-sided normal p).
compare_odds <- function(odds1, odds2) {
  # Guard against silent vector recycling when the summaries differ in length
  stopifnot(length(odds1$odds_ratio) == length(odds2$odds_ratio))
  # Difference in log odds per quintile
  delta_log_odds <- log(odds1$odds_ratio) - log(odds2$odds_ratio)
  # SE of the difference; formula assumes the two estimates are independent
  se_delta_log_odds <- sqrt(odds1$se_log_odds_ratio^2 + odds2$se_log_odds_ratio^2)
  # Two-sided Z-test for each quintile
  z_values <- delta_log_odds / se_delta_log_odds
  p_values <- 2 * pnorm(abs(z_values), lower.tail = FALSE)
  data.frame(quintile = seq_along(p_values), p_values = p_values)
}
# Pairwise odds-ratio comparisons across the three models; each comparison is
# labeled before stacking so the combined frame is plot-ready.
p_values_coding_vs_common_coding <- compare_odds(coding_rare_summary, common_and_coding_summary) %>%
  mutate(comparison = "Coding vs Common and Coding")
p_values_coding_vs_model <- compare_odds(coding_rare_summary, combined_odds_summary) %>%
  mutate(comparison = "Coding vs Model")
p_values_common_coding_vs_model <- compare_odds(common_and_coding_summary, combined_odds_summary) %>%
  mutate(comparison = "Common and Coding vs Model")
# Stack the three comparisons and make quintile a factor for axis labeling
combined_p_values <- bind_rows(p_values_coding_vs_common_coding,
                               p_values_coding_vs_model,
                               p_values_common_coding_vs_model) %>%
  mutate(quintile = factor(quintile, levels = 1:5))
combined_p_values
quintile p_values comparison
1 1 1.0000000000 Coding vs Common and Coding
2 2 0.2755794672 Coding vs Common and Coding
3 3 0.1193593972 Coding vs Common and Coding
4 4 0.0024478936 Coding vs Common and Coding
5 5 0.0375063182 Coding vs Common and Coding
6 1 1.0000000000 Coding vs Model
7 2 0.0802036186 Coding vs Model
8 3 0.0143006222 Coding vs Model
9 4 0.0005882193 Coding vs Model
10 5 0.0001029296 Coding vs Model
11 1 1.0000000000 Common and Coding vs Model
12 2 0.5116308088 Common and Coding vs Model
13 3 0.3728300451 Common and Coding vs Model
14 4 0.6892061513 Common and Coding vs Model
15 5 0.0598086456 Common and Coding vs Model
Calculate the scores per category and plot them
#plot
# Helper: notched boxplot of one score column by Category, with the shared
# fill palette and a clipped y range. `score_col` is an unquoted column name
# (tidy-eval embrace keeps the original axis label).
score_boxplot <- function(data, score_col, ylims) {
  ggplot(data, aes(x = Category, y = {{ score_col }}, fill = Category)) +
    geom_boxplot(outlier.shape = NA, notch = TRUE) +
    scale_fill_manual(values = group_colors) +
    ylim(ylims[1], ylims[2]) +
    theme_cowplot(12)
}
common_variant_score_plot <- score_boxplot(categorized_combined_data, common_variant_score, c(-2, 2))
cardiac_variant_score_plot <- score_boxplot(categorized_combined_data, cardiac_variant_score, c(-1, 0.6))
epilepsy_variant_score_plot <- score_boxplot(categorized_combined_data, epilepsy_variant_score, c(-1, 0.6))
noncoding_variant_score_plot <- score_boxplot(categorized_combined_data, noncoding_variant_score, c(-1, 0.75))
scores_plot <- score_boxplot(categorized_combined_data, scores, c(-3, 3))
# suppressWarnings: ylim clipping removes out-of-range rows with a warning —
# presumably why the prints were wrapped; confirm no other warnings matter
suppressWarnings(print(common_variant_score_plot))
suppressWarnings(print(cardiac_variant_score_plot))
suppressWarnings(print(epilepsy_variant_score_plot))
suppressWarnings(print(noncoding_variant_score_plot))
suppressWarnings(print(scores_plot))
replication
NOW PLOT THE Z normalized replication data
# Replication-cohort means and SDs used to z-normalize each score below.
# NOTE(review): the hand-rolled (x - mean) / sd duplicates base::scale().
mean_common_replication <- mean(categorized_replication_data$common_variant_score_replication)
sd_common_replication <- sd(categorized_replication_data$common_variant_score_replication)
mean_cardiac_replication <- mean(categorized_replication_data$cardiac_variant_score_replication)
sd_cardiac_replication <- sd(categorized_replication_data$cardiac_variant_score_replication)
mean_epilepsy_replication <- mean(categorized_replication_data$epilepsy_variant_score_replication)
sd_epilepsy_replication <- sd(categorized_replication_data$epilepsy_variant_score_replication)
mean_noncoding_replication <- mean(categorized_replication_data$noncoding_variant_score_replication)
sd_noncoding_replication <- sd(categorized_replication_data$noncoding_variant_score_replication)
mean_scores_replication <- mean(categorized_replication_data$scores_replication)
sd_scores_replication <- sd(categorized_replication_data$scores_replication)
# Notched boxplots of each z-normalized replication score by Category
common_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (common_variant_score_replication - mean_common_replication) / sd_common_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
cardiac_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (cardiac_variant_score_replication - mean_cardiac_replication)/ sd_cardiac_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
epilepsy_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (epilepsy_variant_score_replication - mean_epilepsy_replication)/ sd_epilepsy_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
noncoding_variant_score_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (noncoding_variant_score_replication - mean_noncoding_replication)/ sd_noncoding_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
scores_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication - mean_scores_replication)/ sd_scores_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
# suppressWarnings: ylim clipping drops out-of-range rows with a warning
suppressWarnings(print(common_variant_score_replication_plot))
suppressWarnings(print(cardiac_variant_score_replication_plot))
Notch went outside hinges
ℹ Do you want `notch = FALSE`?
suppressWarnings(print(epilepsy_variant_score_replication_plot))
suppressWarnings(print(noncoding_variant_score_replication_plot))
suppressWarnings(print(scores_replication_plot))
Compute the wilcox.tests
# Wilcoxon rank-sum: cardiac variant score by Category (discovery cohort)
wilcox.test_cardiomyopathy <- wilcox.test(cardiac_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_cardiomyopathy
Wilcoxon rank sum test with continuity correction
data: cardiac_variant_score by Category
W = 91701, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: epilepsy variant score by Category (discovery cohort)
wilcox.test_epilepsy <- wilcox.test(epilepsy_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_epilepsy
Wilcoxon rank sum test with continuity correction
data: epilepsy_variant_score by Category
W = 104718, p-value = 1.487e-10
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: common variant score by Category (discovery cohort)
wilcox.test_common <- wilcox.test(common_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_common
Wilcoxon rank sum test with continuity correction
data: common_variant_score by Category
W = 93124, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: noncoding variant score by Category (discovery cohort)
wilcox.test_noncoding <- wilcox.test(noncoding_variant_score ~ Category, data = categorized_combined_data)
wilcox.test_noncoding
Wilcoxon rank sum test with continuity correction
data: noncoding_variant_score by Category
W = 115548, p-value = 2.943e-05
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: combined model score by Category (discovery cohort)
wilcox.test_combined <- wilcox.test(scores ~ Category, data = categorized_combined_data)
wilcox.test_combined
Wilcoxon rank sum test with continuity correction
data: scores by Category
W = 71316, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
replication
# Wilcoxon rank-sum: cardiac variant score by Category (replication cohort)
wilcox.test_cardiomyopathy_replication <- wilcox.test(cardiac_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_cardiomyopathy_replication
Wilcoxon rank sum test with continuity correction
data: cardiac_variant_score_replication by Category
W = 430, p-value = 3.978e-14
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: epilepsy variant score by Category (replication cohort)
wilcox.test_epilepsy_replication <- wilcox.test(epilepsy_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_epilepsy_replication
Wilcoxon rank sum test with continuity correction
data: epilepsy_variant_score_replication by Category
W = 734.5, p-value = 1.268e-09
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: common variant score by Category (replication cohort);
# this is the one component that does not replicate (p = 0.4 in the run below)
wilcox.test_common_replication <- wilcox.test(common_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_common_replication
Wilcoxon rank sum test with continuity correction
data: common_variant_score_replication by Category
W = 1804, p-value = 0.4004
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: noncoding variant score by Category (replication cohort)
wilcox.test_noncoding_replication <- wilcox.test(noncoding_variant_score_replication ~ Category, data = categorized_replication_data)
wilcox.test_noncoding_replication
Wilcoxon rank sum test with continuity correction
data: noncoding_variant_score_replication by Category
W = 497, p-value = 4.796e-13
alternative hypothesis: true location shift is not equal to 0
# Wilcoxon rank-sum: combined model score by Category (replication cohort)
wilcox.test_combined_replication <- wilcox.test(scores_replication ~ Category, data = categorized_replication_data)
wilcox.test_combined_replication
Wilcoxon rank sum test with continuity correction
data: scores_replication by Category
W = 442, p-value = 6.391e-14
alternative hypothesis: true location shift is not equal to 0
Plot the distribution of the categories by score
# Overlaid density of the combined score, one curve per Category.
distribution_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
  # after_stat(density) replaces the deprecated ..density.. dot-dot notation;
  # density is already geom_density's default y, so output is unchanged
  geom_density(aes(y = after_stat(density)), alpha = 0.5, adjust = 1) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  labs(title = "Normalized Density Plots of Scores by Category",
       x = "Score",
       y = "Density") +
  theme_cowplot(12)
# suppressWarnings: x-axis limits drop out-of-range rows with a warning
suppressWarnings(print(distribution_score_plot))
Plot the filled density of the categories by score
# Conditional (stacked-to-one) density: at each score value, the fraction of
# samples contributed by each Category; y axis rendered as percentages.
density_score_plot <- ggplot(categorized_combined_data, aes(x = scores, fill = Category)) +
  geom_density(position = "fill", alpha = 0.5) +
  scale_fill_manual(values = group_colors) +
  scale_x_continuous(limits = c(-3, 4)) +
  scale_y_continuous(labels = scales::percent) +
  theme_cowplot() +
  labs(title = "Stacked Density of Scores by Category",
       x = "Score",
       y = "Fraction")
# Print the plot (suppressing the out-of-range-rows warning from the x limits)
suppressWarnings(print(density_score_plot))
Plot the ROC and Precision-recall
# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
#Full model
# ROC curve for the full model's predicted probabilities (discovery cohort)
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
paste("Full model AUC:")
[1] "Full model AUC:"
auc(roc_result_full)
Area under the curve: 0.7375
#common variants
# ROC curve for the common-variant-only model (discovery cohort)
roc_result_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
paste("commonvariant model AUC:")
[1] "commonvariant model AUC:"
auc(roc_result_common)
Area under the curve: 0.6573
#coding variants
# ROC curve for the rare-coding-variant model (discovery cohort)
roc_result_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$coding_rare_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
paste("coding variant model AUC:")
[1] "coding variant model AUC:"
auc(roc_result_coding)
Area under the curve: 0.6709
#common and coding variants
# ROC curve for the combined common + coding model (discovery cohort)
roc_result_common_and_coding <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$common_and_coding_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
paste("common and coding variant model AUC:")
[1] "common and coding variant model AUC:"
auc(roc_result_common_and_coding)
Area under the curve: 0.7222
Plot the ROC and Precision-recall
# Convert 'Category' into a binary format: 0 for control (1), 1 for case (2-4)
categorized_replication_data$binary_outcome <- ifelse(categorized_replication_data$Category == "Control", 0, 1)
# ROC curve for the full model in the replication cohort
roc_result_full_replication <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
auc(roc_result_full_replication)
Area under the curve: 0.8882
#compute the ones not plotted
# ROC objects for the remaining component models (discovery cohort); AUCs are
# extracted below for the performance comparison
roc_result_cardiac <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
roc_result_epilepsy <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
roc_result_epilepsy_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$epilepsy_and_common_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
roc_result_cardiac_and_common <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$cardiac_and_common_probability, plot = FALSE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
# Calculate AUC for each ROC object
auc_epilepsy <- auc(roc_result_epilepsy)
auc_cardiac <- auc(roc_result_cardiac)
auc_common <- auc(roc_result_common)
auc_epilepsy_and_common <- auc(roc_result_epilepsy_and_common)
auc_cardiac_and_common <- auc(roc_result_cardiac_and_common)
auc_common_and_coding <- auc(roc_result_common_and_coding)
auc_coding <- auc(roc_result_coding)
auc_full <- auc(roc_result_full)
# Base AUC for random chance
auc_random_chance <- 0.5
# Shared name/AUC vectors so the two data frames cannot drift apart
# (the original duplicated the Model vector and listed each AUC twice)
model_names <- c("Epilepsy", "Cardiac", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Coding", "Full")
model_aucs <- c(auc_epilepsy, auc_cardiac, auc_common, auc_epilepsy_and_common, auc_cardiac_and_common, auc_common_and_coding, auc_coding, auc_full)
# Percentage improvement over a random classifier (AUC = 0.5)
percentage_changes_df <- data.frame(
  Model = model_names,
  PercentageChange = (model_aucs - auc_random_chance) / auc_random_chance * 100
)
auc_df <- data.frame(
  Model = model_names,
  auc = model_aucs
)
# Set factor levels — display order intentionally differs from creation order
# ("Coding" moved next to the other single-class models)
percentage_changes_df$Model <- factor(percentage_changes_df$Model, levels = c(
  "Epilepsy", "Cardiac", "Coding", "Common", "Epilepsy and Common", "Cardiac and Common", "Common and Coding", "Full"
))
# Plot
# NOTE(review): theme_cowplot(12) fully overrides theme_minimal() here —
# confirm whether theme_minimal() was meant to be removed
auc_performance_plot <- ggplot(percentage_changes_df, aes(x = Model, y = PercentageChange, fill = Model)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  theme_minimal() +
  labs(title = "Percentage Change Over Random Chance AUC",
       y = "Percentage Change (%)",
       x = "") +
  theme_cowplot(12)
print(auc_performance_plot)
# Calculate p-values using DeLong's test
# Small wrapper so each pairwise comparison reads as one line
delong_p <- function(roc_a, roc_b) {
  roc.test(roc_a, roc_b)$p.value
}
p_values <- list(
  "Epilepsy vs CMAR" = delong_p(roc_result_epilepsy, roc_result_cardiac),
  "CMAR vs Common" = delong_p(roc_result_common, roc_result_cardiac),
  "Epilepsy vs Common" = delong_p(roc_result_common, roc_result_epilepsy),
  "Coding vs CMAR" = delong_p(roc_result_coding, roc_result_cardiac),
  "Coding vs Epilepsy" = delong_p(roc_result_coding, roc_result_epilepsy),
  "CMAR and Common vs Common" = delong_p(roc_result_cardiac_and_common, roc_result_common),
  "Common vs Coding" = delong_p(roc_result_common, roc_result_coding),
  "CMAR and common vs Common and Coding" = delong_p(roc_result_cardiac_and_common, roc_result_common_and_coding),
  "Common vs Common and Coding" = delong_p(roc_result_common, roc_result_common_and_coding),
  "Full vs Common and Coding" = delong_p(roc_result_common_and_coding, roc_result_full)
)
# Print p-values
print(p_values)
$`Epilepsy vs CMAR`
[1] 0.01751543
$`CMAR vs Common`
[1] 0.8050644
$`Epilepsy vs Common`
[1] 0.03735391
$`Coding vs CMAR`
[1] 0.3745788
$`Coding vs Epilepsy`
[1] 6.97466e-05
$`CMAR and Common vs Common`
[1] 2.191874e-06
$`Common vs Coding`
[1] 0.5104602
$`CMAR and common vs Common and Coding`
[1] 0.08744431
$`Common vs Common and Coding`
[1] 2.504464e-07
$`Full vs Common and Coding`
[1] 0.0170021
Let's see how common and rare cardiac variants interact
# Scatter of cardiac vs common variant score ranks, colored by case/control.
# NOTE(review): an earlier draft immediately above mapped colors to levels
# "1", "2-3", "4-6", which do not exist in Category (Control/zCase) and was
# overwritten on the very next statement; it has been removed as dead code.
dot_plot <- ggplot(categorized_combined_data, aes(x = rank_cardiac, y = rank_common, color = Category)) +
  geom_point(alpha = 1) +
  scale_color_manual(values = group_colors) +
  labs(x = "Cardiac Variant Score Rank", y = "Common Variant Score Rank", color = "Arrest Status") +
  theme_cowplot(12) +
  theme(strip.text.x = element_text(size = 10))
dot_plot
median_cardiac <- median(categorized_combined_data$rank_cardiac, na.rm = TRUE)
median_common <- median(categorized_combined_data$rank_common, na.rm = TRUE)
# Assign each sample to a quadrant relative to the two medians.
# NOTE(review): the original cuts were not mutually exclusive — a sample
# sitting exactly on both medians matched Quadrant 1 (<=, <=) AND
# Quadrant 4 (>=, >=) and was double-counted. The cuts below are exclusive
# and exhaustive: ties on a median go to the low side.
quadrant1 <- categorized_combined_data %>%
  filter(rank_cardiac <= median_cardiac & rank_common <= median_common)
quadrant2 <- categorized_combined_data %>%
  filter(rank_cardiac <= median_cardiac & rank_common > median_common)
quadrant3 <- categorized_combined_data %>%
  filter(rank_cardiac > median_cardiac & rank_common <= median_common)
quadrant4 <- categorized_combined_data %>%
  filter(rank_cardiac > median_cardiac & rank_common > median_common)
# Combine into one data frame
combined_quadrants <- bind_rows(
  quadrant1 %>% mutate(Quadrant = 'Quadrant 1'),
  quadrant2 %>% mutate(Quadrant = 'Quadrant 2'),
  quadrant3 %>% mutate(Quadrant = 'Quadrant 3'),
  quadrant4 %>% mutate(Quadrant = 'Quadrant 4')
)
# Per-category quadrant occupancy: counts plus within-category percentages.
percentage_by_category <- combined_quadrants %>%
  count(Category, Quadrant, name = "Count") %>%
  group_by(Category) %>%
  mutate(Total = sum(Count), Percentage = Count / Total * 100)
# View the result (still grouped by Category, matching the original output)
percentage_by_category
# A tibble: 8 × 5
# Groups: Category [2]
Category Quadrant Count Total Percentage
<fct> <chr> <int> <int> <dbl>
1 Control Quadrant 1 210 537 39.1
2 Control Quadrant 2 117 537 21.8
3 Control Quadrant 3 112 537 20.9
4 Control Quadrant 4 98 537 18.2
5 zCase Quadrant 1 99 506 19.6
6 zCase Quadrant 2 95 506 18.8
7 zCase Quadrant 3 101 506 20.0
8 zCase Quadrant 4 211 506 41.7
# Overall proportion of samples in each quadrant (both categories pooled) —
# the expected occupancy under independence of Category and Quadrant
overall_counts <- combined_quadrants %>%
  count(Quadrant) %>%
  mutate(OverallProportion = n / sum(n))
# Calculate total counts by category for scaling expected values
category_totals <- combined_quadrants %>%
  count(Category) %>%
  dplyr::rename(TotalInCategory = n)
# Expected count per (Category, Quadrant) under independence
expected_counts <- combined_quadrants %>%
  dplyr::select(Category, Quadrant) %>%
  distinct() %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory)
# Observed count per (Category, Quadrant); also feeds the xtabs below
observed_counts <- combined_quadrants %>%
  count(Category, Quadrant) %>%
  dplyr::rename(Observed = n)
# Join observed with expected and compute the Observed/Expected ratio once
# (the original recomputed an identical Ratio column in a second mutate)
combined_for_plotting <- observed_counts %>%
  left_join(overall_counts, by = "Quadrant") %>%
  left_join(category_totals, by = "Category") %>%
  mutate(Expected = OverallProportion * TotalInCategory, Ratio = Observed / Expected)
# Plot the ratio. NOTE(review): the labels previously said
# "Expected/Observed" although Ratio = Observed / Expected — labels corrected.
quadrant_plot <- ggplot(combined_for_plotting, aes(x = Quadrant, y = Ratio, fill = Category)) +
  geom_bar(stat = "identity", position = position_dodge()) +
  labs(y = "Observed/Expected Ratio", x = "Quadrant", title = "Observed/Expected Ratios by Category and Quadrant") +
  theme_cowplot(12) +
  scale_fill_manual(values = group_colors) +
  geom_hline(yintercept = 1, linetype = "dashed", color = "Black")
quadrant_plot
# Create the contingency table
# Category x Quadrant counts, rebuilt from the observed_counts data frame
contingency_table <- xtabs(Observed ~ Category + Quadrant, data = observed_counts)
# Print the contingency table
print(contingency_table)
Quadrant
Category Quadrant 1 Quadrant 2 Quadrant 3 Quadrant 4
Control 210 117 112 98
zCase 99 95 101 211
# Chi-squared test
# Tests whether quadrant occupancy is independent of case/control status
chi_squared_result <- chisq.test(contingency_table)
# Print the results of the chi-squared test
print(chi_squared_result)
Pearson's Chi-squared test
data: contingency_table
X-squared = 83.201, df = 3, p-value < 2.2e-16
# Restrict to the upper cardiac-score quintiles.
# NOTE(review): despite the name, `> 3` keeps quintiles 4-5, i.e. the top
# TWO quintiles — confirm the intended cutoff.
top_3_quintiles <- categorized_combined_data %>%
  filter(cardiac_score_quintiles > 3)
# Scatter with per-category linear fits; sample CGM0000563 is re-drawn in red
# as a highlighted point on top of the base layer.
common_density_score_plot <- ggplot(top_3_quintiles, aes(x = cardiac_variant_score, y = common_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +
  labs(x = "Cardiac Variant Score", y = "Common Variant Score", title = "Cardiac Variant Score vs. Common Variant Score by Category") +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12) +
  geom_point(data = subset(top_3_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = common_variant_score), color = "red")
# Print
print(common_density_score_plot)
`geom_smooth()` using formula = 'y ~ x'
#Do the stats
# OLS with interaction: does the cardiac-score vs common-score slope differ
# between cases and controls within the top cardiac quintiles?
model_common_cardiac <- lm(common_variant_score ~ cardiac_variant_score * Category, data = top_3_quintiles)
summary(model_common_cardiac)
Call:
lm(formula = common_variant_score ~ cardiac_variant_score * Category,
data = top_3_quintiles)
Residuals:
Min 1Q Median 3Q Max
-2.97751 -0.32432 0.02594 0.40306 1.52690
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.14428 0.05354 -2.695 0.00733 **
cardiac_variant_score -0.17503 0.08772 -1.995 0.04667 *
CategoryzCase 0.36154 0.06888 5.249 2.45e-07 ***
cardiac_variant_score:CategoryzCase 0.17451 0.08979 1.943 0.05264 .
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.6156 on 414 degrees of freedom
Multiple R-squared: 0.09912, Adjusted R-squared: 0.0926
F-statistic: 15.18 on 3 and 414 DF, p-value: 2.158e-09
# Scatter of cardiac vs epilepsy variant scores in the top cardiac quintiles,
# with per-category linear fits; sample CGM0000563 is re-drawn in red.
epilepsy_density_score_plot <- ggplot(top_3_quintiles, aes(x = cardiac_variant_score, y = epilepsy_variant_score)) +
  geom_point(aes(color = Category), alpha = 0.5) +
  geom_smooth(method = "lm", aes(group = Category, color = Category)) +
  labs(x = "Cardiac Variant Score", y = "Epilepsy Variant Score", title = "Cardiac Variant Score vs. Epilepsy Variant Score by Category") +
  scale_color_manual(values = group_colors) +
  theme_cowplot(12) +
  geom_point(data = subset(top_3_quintiles, ID == "CGM0000563"), aes(x = cardiac_variant_score, y = epilepsy_variant_score), color = "red")
# Print
print(epilepsy_density_score_plot)
`geom_smooth()` using formula = 'y ~ x'
#Do the stats
# OLS with interaction: does the cardiac-score vs epilepsy-score slope differ
# between cases and controls within the top cardiac quintiles?
model_epilepsy_cardiac <- lm(epilepsy_variant_score ~ cardiac_variant_score * Category, data = top_3_quintiles)
summary(model_epilepsy_cardiac)
Call:
lm(formula = epilepsy_variant_score ~ cardiac_variant_score *
Category, data = top_3_quintiles)
Residuals:
Min 1Q Median 3Q Max
-4.2366 -0.3230 -0.0335 0.2956 10.1092
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.04534 0.07551 -0.600 0.5485
cardiac_variant_score -0.12827 0.12372 -1.037 0.3004
CategoryzCase -0.21357 0.09714 -2.199 0.0285 *
cardiac_variant_score:CategoryzCase 0.71880 0.12664 5.676 2.6e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.8682 on 414 degrees of freedom
Multiple R-squared: 0.5496, Adjusted R-squared: 0.5463
F-statistic: 168.4 on 3 and 414 DF, p-value: < 2.2e-16
# do wilcox_test
# Rank-sum test of the epilepsy score between zCase and Control within the
# top cardiac quintiles. NOTE(review): the %in% filter is a no-op if Category
# has only these two levels — confirm no other levels exist here.
CMARR_epilepsy_wilcox_test_results <- wilcox.test(epilepsy_variant_score ~ Category,
data = top_3_quintiles %>% filter(Category %in% c("zCase", "Control")))
CMARR_epilepsy_wilcox_test_results
Wilcoxon rank sum test with continuity correction
data: epilepsy_variant_score by Category
W = 13768, p-value = 2.933e-08
alternative hypothesis: true location shift is not equal to 0
# Rank-sum test of the common variant score between zCase and Control within
# the top cardiac quintiles
CMARR_common_wilcox_test_results <- wilcox.test(common_variant_score ~ Category,
data = top_3_quintiles %>% filter(Category %in% c("zCase", "Control")))
CMARR_common_wilcox_test_results
Wilcoxon rank sum test with continuity correction
data: common_variant_score by Category
W = 13423, p-value = 5.446e-09
alternative hypothesis: true location shift is not equal to 0
# High-probability samples (> 0.7) carrying NO cardiomyopathy P/LP variant —
# i.e. samples the model flags without a classical diagnostic variant.
no_PLPs_categorized_data <- categorized_combined_data %>%
filter(PLP_Cardiomyopathy < 1 & probability > 0.7)
# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative DISCOVERY controls over the threshold")
[1] "number of PLP negative DISCOVERY controls over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group == 1))
[1] 10
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative DISCOVERY cases over the threshold")
[1] "number of PLP negative DISCOVERY cases over the threshold"
nrow(no_PLPs_categorized_data %>% filter(arrest_group > 1))
[1] 57
#for replication
# Same tally in the replication cohort
no_PLPs_categorized_replication_data <- categorized_replication_data %>%
filter(PLP_Cardiomyopathy < 1 & probability > 0.7)
# Count the number of rows where arrest_group equals 1 (Controls)
paste("number of PLP negative REPLICATION controls over the threshold")
[1] "number of PLP negative REPLICATION controls over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group == 1))
[1] 0
# Count the number of rows where arrest_group equals > 1 (Cases)
paste("number of PLP negative REPLICATION cases over the threshold")
[1] "number of PLP negative REPLICATION cases over the threshold"
nrow(no_PLPs_categorized_replication_data %>% filter(arrest_group > 1))
[1] 27
NOW, retrain the model on the validation cohort and apply it to the discovery cohort
# Refit the logistic model on the replication cohort only, with the same
# predictor set used for the discovery model. Fixed a stray duplicated "+ +"
# between PLP_Cardiomyopathy and Cardiomyopathy_noncoding_rare (a unary plus,
# so behavior is unchanged). Note: the warnings and residual deviance ~0 below
# show the fit separates perfectly (126 obs, 19 predictors), so individual
# coefficients are not interpretable — only the composite score is used.
model_replication <- glm(Category ~ Normalized_Heart_rate + Normalized_PR_interval + Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF + Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare + Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare + PLP_Epilepsy + PLP_Cardiomyopathy + Cardiomyopathy_noncoding_rare + Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
data = categorized_replication_data, family = binomial())
Warning: glm.fit: algorithm did not converge
Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_replication
Call: glm(formula = Category ~ Normalized_Heart_rate + Normalized_PR_interval +
Normalized_QTc + Normalized_BP + Normalized_QRS + Normalized_LVEF +
Normalized_LVESV + Normalized_SVI + Normalized_Afib + Epilepsy_rare +
Epilepsy_ultrarare + Cardiomyopathy_rare + Cardiomyopathy_ultrarare +
PLP_Epilepsy + PLP_Cardiomyopathy + +Cardiomyopathy_noncoding_rare +
Epilepsy_noncoding_rare + Cardiomyopathy_null + Epilepsy_null,
family = binomial(), data = categorized_replication_data)
Coefficients:
(Intercept) Normalized_Heart_rate
-32.4336 -1.9711
Normalized_PR_interval Normalized_QTc
-3.5941 -13.7152
Normalized_BP Normalized_QRS
-2.1713 9.7180
Normalized_LVEF Normalized_LVESV
-5.8998 -23.3856
Normalized_SVI Normalized_Afib
2.4789 8.0705
Epilepsy_rare Epilepsy_ultrarare
-6.1431 -64.2490
Cardiomyopathy_rare Cardiomyopathy_ultrarare
-6.8631 -4.2113
PLP_Epilepsy PLP_Cardiomyopathy
-32.7781 9.8564
Cardiomyopathy_noncoding_rare Epilepsy_noncoding_rare
0.6944 3.6171
Cardiomyopathy_null Epilepsy_null
-8.5436 -32.4137
Degrees of Freedom: 125 Total (i.e. Null); 106 Residual
Null Deviance: 174.2
Residual Deviance: 1.524e-08 AIC: 40
# Apply the replication-trained model to the discovery cohort; type = "link"
# returns the linear predictor (log-odds), not probabilities
scores_replication_model_on_discovery <- predict(model_replication, newdata = categorized_combined_data, type = "link")
# Add scores to the dataframes
categorized_combined_data$scores_replication_model_on_discovery <- scores_replication_model_on_discovery
# z normalize (no na.rm: assumes predict() produced no NAs — TODO confirm)
mean_replication_on_discovery <- mean(categorized_combined_data$scores_replication_model_on_discovery)
sd_replication_on_discovery <- sd(categorized_combined_data$scores_replication_model_on_discovery)
# Boxplot of z-scored cross-cohort scores; ylim clipping triggers the
# "Removed N rows" warning below
scores_replication_model_on_discovery_plot <- ggplot(categorized_combined_data, aes(x = Category, y = (scores_replication_model_on_discovery - mean_replication_on_discovery)/sd_replication_on_discovery, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
scores_replication_model_on_discovery_plot
Warning: Removed 6 rows containing non-finite outside the scale range
(`stat_boxplot()`).
# Wilcoxon rank-sum test: do case/control score distributions differ when the
# replication-trained model is applied to discovery samples?
wilcox.test_result_replication_on_discovery <- wilcox.test(scores_replication_model_on_discovery ~ Category, data = categorized_combined_data)
# View the results
print(wilcox.test_result_replication_on_discovery)
Wilcoxon rank sum test with continuity correction
data: scores_replication_model_on_discovery by Category
W = 118506, p-value = 0.0003579
alternative hypothesis: true location shift is not equal to 0
Replication on replication
# Apply the replication-trained model back to its own training data
# (in-sample fit; expected to be near-perfect given the separated glm)
scores_replication_model_on_replication <- predict(model_replication, newdata = categorized_replication_data, type = "link")
# Add scores to the dataframes
categorized_replication_data$scores_replication_model_on_replication <- scores_replication_model_on_replication
# z normalize (no na.rm: assumes predict() produced no NAs — TODO confirm)
mean_replication_on_replication <- mean(categorized_replication_data$scores_replication_model_on_replication)
sd_replication_on_replication <- sd(categorized_replication_data$scores_replication_model_on_replication)
# Boxplot of z-scored in-sample scores; ylim clipping triggers the warning below
scores_replication_model_on_replication_plot <- ggplot(categorized_replication_data, aes(x = Category, y = (scores_replication_model_on_replication - mean_replication_on_replication)/sd_replication_on_replication, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
scale_fill_manual(values = group_colors) +
ylim(-2.5, 2.5) +
theme_cowplot(12)
scores_replication_model_on_replication_plot
Warning: Removed 2 rows containing non-finite outside the scale range
(`stat_boxplot()`).
# Wilcoxon rank-sum test (the original comment said "t-test"; the code is a
# wilcox.test): in-sample case/control separation of the replication model
wilcox.test_replication_on_replication <- wilcox.test(scores_replication_model_on_replication ~ Category, data = categorized_replication_data)
# View the results
print(wilcox.test_replication_on_replication)
Wilcoxon rank sum test with continuity correction
data: scores_replication_model_on_replication by Category
W = 0, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
NOW DO AUC FOR ORIGINAL DATA USING REPLICATION COEFFICIENTS
# Convert cross-cohort log-odds scores to probabilities for ROC analysis
categorized_combined_data <- categorized_combined_data %>%
mutate(
probability_replication_on_discovery = plogis(scores_replication_model_on_discovery),
)
# Convert 'Category' into a binary format: 0 for Control, 1 for Case
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
# pROC::roc; AUC quantifies cross-cohort transfer of the replication model
roc_result_full <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$probability_replication_on_discovery, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
auc(roc_result_full)
Area under the curve: 0.5554
Replication model on discovery data Odds Ratio
# Rank discovery samples by the replication-trained model score and assign
# quintiles (ties broken by order of appearance so bins are exactly equal)
categorized_combined_data <- categorized_combined_data %>%
mutate(
replication_model_on_discovery_rank = rank(scores_replication_model_on_discovery, ties.method = "first"),
replication_model_on_discovery_score_quintiles = ceiling(replication_model_on_discovery_rank / (n() / 5)),
)
# Odds ratio per quintile relative to the lowest quintile (helper defined earlier)
replication_model_on_discovery_combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, replication_model_on_discovery_score_quintiles)
plot(log2(replication_model_on_discovery_combined_odds_summary$odds_ratio),
ylim = c(-2, 2),
pch = 19, xlab = "quintile", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) Across quintiles of Score with 95% CI",
xaxt = "n", col = "#3C8C78")
# Draw the x axis that xaxt = "n" suppressed — the original never labeled the
# quintiles (the analogous quintile-5 plots below do call axis())
axis(1, at = seq_along(replication_model_on_discovery_combined_odds_summary$odds_ratio),
labels = seq_along(replication_model_on_discovery_combined_odds_summary$odds_ratio))
lines(1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio), log2(replication_model_on_discovery_combined_odds_summary$odds_ratio), col = "#3C8C78")
# Add error bars for 95% CI - Combined
arrows(x0 = 1:length(replication_model_on_discovery_combined_odds_summary$odds_ratio),
y0 = log2(replication_model_on_discovery_combined_odds_summary$lower_ci),
y1 = log2(replication_model_on_discovery_combined_odds_summary$upper_ci),
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
ANCESTRY
# Filter to controls only (Category == "Control") so the ancestry correction
# is estimated without any case signal
filtered_categorized_combined_data <- subset(categorized_combined_data, Category == "Control")
# Regress scores on the first 20 genetic principal components (PC1-PC20)
ancestry_regression_model <- lm(scores ~ PC1 + PC2 + PC3 + PC4 + PC5 + PC6 + PC7 + PC8 + PC9 + PC10 +
PC11 + PC12 + PC13 + PC14 + PC15 + PC16 + PC17 + PC18 + PC19 + PC20,
data = filtered_categorized_combined_data)
# Keep the coefficients; they are reused below and for the replication cohort
ancestry_coefficients <- coef(ancestry_regression_model)
# Remove the ancestry (PC1-PC20) component from each score: subtract the
# control-fit prediction (intercept + sum of PC_i * coef_i) from every sample.
# Replaces 20 hand-typed per-PC terms with a single matrix product — same
# arithmetic, far less room for a copy-paste error in a PC index.
pc_names <- paste0("PC", 1:20)
categorized_combined_data$adjusted_scores <- categorized_combined_data$scores -
(ancestry_coefficients["(Intercept)"] +
as.vector(as.matrix(categorized_combined_data[, pc_names]) %*%
ancestry_coefficients[pc_names]))
# Map the ancestry-adjusted log-odds back to probabilities
categorized_combined_data <- categorized_combined_data %>%
mutate(adjusted_probability = plogis(adjusted_scores))
# Extract all principal-component columns (names matching PC<number>)
pca_columns <- categorized_combined_data[, grep("^PC\\d+$", names(categorized_combined_data))]
# Gaussian mixture model clustering (mclust::Mclust) with a fixed G = 5
# components — chosen to match the number of ancestry groups assigned below
gmm_result <- Mclust(pca_columns, G = 5)
# Add the cluster assignments to the original dataframe
categorized_combined_data$cluster_gmm <- as.factor(gmm_result$classification)
# Quick visual check of clusters in the first two PCs
ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm)) +
geom_point() +
ggtitle("GMM Clustering of PCA Data") +
theme_minimal() +
scale_color_manual(values = rainbow(length(unique(gmm_result$classification)))) +
theme(legend.position = "bottom")
# Cross-tabulate GMM cluster vs self-reported Ancestry to see which ancestry
# dominates each cluster (used to label clusters just below)
contingency_table <- table(categorized_combined_data$cluster_gmm, categorized_combined_data$Ancestry)
contingency_df <- as.data.frame(contingency_table)
colnames(contingency_df) <- c("Cluster", "Ancestry", "Count")
# Bar plot to show the makeup of each cluster by ancestry
ggplot(contingency_df, aes(x = Cluster, y = Count, fill = Ancestry)) +
geom_bar(stat = "identity", position = "dodge") +
ggtitle("Makeup of Each Cluster by Ancestry") +
xlab("Cluster") + ylab("Count") +
theme_minimal() +
theme(legend.position = "bottom")
# Manual mapping from GMM cluster label to its majority ancestry (read off the
# contingency bar plot above); clusters 3 and 5 both collapse to "OTH"
cluster_to_ancestry <- c("1" = "EUR", "2" = "HIS", "4" = "AFR", "3" = "OTH", "5" = "OTH")
# Vectorized named-vector lookup replaces the original per-element sapply; the
# original also had a dead as.character() assignment that was immediately
# overwritten. factor() drops lookup names, so the result is identical.
# Levels follow unique(cluster_to_ancestry): EUR, HIS, AFR, OTH.
categorized_combined_data$cluster_gmm_Ancestry <- factor(
cluster_to_ancestry[as.character(categorized_combined_data$cluster_gmm)],
levels = unique(cluster_to_ancestry)
)
# Per (Category, ancestry-cluster) counts, used as geom_text labels below.
# summarise() emits the ".groups" message seen after this chunk; the explicit
# ungroup() drops the residual Category grouping.
summary_data <- categorized_combined_data %>%
group_by(Category, cluster_gmm_Ancestry) %>%
summarise(count = n()) %>%
ungroup()
`summarise()` has grouped output by 'Category'. You can override using the
`.groups` argument.
# Define the Wes Anderson palette, one color per ancestry cluster
palette_colors <- wes_palette("FantasticFox1", length(unique(categorized_combined_data$cluster_gmm_Ancestry)), type = "continuous")
# 100%-stacked bar chart of ancestry-cluster composition within each Category.
# Fix: theme_cowplot() is a COMPLETE theme; the original added it after
# theme(legend.position = "bottom"), silently discarding the legend placement.
# Complete theme first, tweak after.
ancestry_makeup <- ggplot(categorized_combined_data, aes(x = Category, fill = cluster_gmm_Ancestry)) +
geom_bar(position = "fill") +
scale_y_continuous(labels = scales::percent) +
ggtitle("Makeup of cluster_gmm_Ancestry within Each Category") +
xlab("Category") +
ylab("Percentage") +
scale_fill_manual(values = palette_colors) +
# NOTE(review): y = count / sum(count) combined with position_fill() looks
# suspect — position_fill re-normalizes within each bar, so verify the label
# heights against the rendered figure
geom_text(data = summary_data, aes(label = count, y = count / sum(count), group = cluster_gmm_Ancestry),
position = position_fill(vjust = 0.5), color = "black") +
theme_cowplot(12) +
theme(legend.position = "bottom")
# PC1 vs PC2 scatter colored by the relabeled ancestry clusters.
# Fix: the original stacked theme_minimal() + theme(legend.position) +
# theme_cowplot(12); theme_cowplot is a complete theme, so it wiped out both
# earlier calls. theme_minimal was dead code and the legend tweak was lost —
# apply the complete theme first, then the tweak.
clusters <- ggplot(categorized_combined_data, aes(x = PC1, y = PC2, color = cluster_gmm_Ancestry)) +
geom_point() +
ggtitle("GMM Clustering of PCA Data") +
scale_color_manual(values = palette_colors) +
theme_cowplot(12) +
theme(legend.position = "bottom")
# Unadjusted scores by ancestry cluster, split by Category.
# Fix: theme_cowplot(12) (a complete theme) previously came after
# theme(legend.position = "bottom") and discarded it; order swapped.
ancestry_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Scores") +
scale_fill_manual(values = group_colors) +
theme_cowplot(12) +
theme(legend.position = "bottom") +
ylim(-3, 3)
# Ancestry-adjusted scores by ancestry cluster, split by Category.
# Fix: theme_cowplot(12) (a complete theme) previously came after
# theme(legend.position = "bottom") and discarded it; order swapped.
ancestry_adjusted_scores <- ggplot(categorized_combined_data, aes(x = cluster_gmm_Ancestry, y = adjusted_scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Adjusted Scores by cluster_gmm_Ancestry and Category",
x = "cluster_gmm_Ancestry",
y = "Adjusted Scores") +
scale_fill_manual(values = group_colors) +
theme_cowplot(12) +
theme(legend.position = "bottom") +
ylim(-3, 3)
# Overall adjusted-score distribution by Category (NOT split by ancestry).
# Fixes: copy-pasted title/x-label referred to cluster_gmm_Ancestry although
# x = Category; and theme_cowplot(12) (a complete theme) came after the
# legend tweak, discarding it. Variable name kept (used when printed below).
ancestry_adjusted__total_scores <- ggplot(categorized_combined_data, aes(x = Category, y = adjusted_scores, fill = Category)) +
geom_boxplot(outlier.shape = NA, notch = TRUE) +
labs(title = "Adjusted Scores by Category",
x = "Category",
y = "Adjusted Scores") +
scale_fill_manual(values = group_colors) +
theme_cowplot(12) +
theme(legend.position = "bottom") +
ylim(-3, 3)
# Render the ancestry figures; suppressWarnings hides the "Removed N rows"
# warnings produced by ylim(-3, 3) clipping
clusters
ancestry_makeup
suppressWarnings(print(ancestry_scores))
suppressWarnings(print(ancestry_adjusted_scores))
suppressWarnings(print(ancestry_adjusted__total_scores))
# Case/control separation persists after ancestry adjustment
wilcox.test(adjusted_scores ~ Category, data = categorized_combined_data)
Wilcoxon rank sum test with continuity correction
data: adjusted_scores by Category
W = 92675, p-value < 2.2e-16
alternative hypothesis: true location shift is not equal to 0
Odds ratios in the highest quintile across ancestry, adjusted
# Quintiles of the ancestry-ADJUSTED score (ties broken by first occurrence)
categorized_combined_data <- categorized_combined_data %>%
mutate(
rank_adjusted = rank(adjusted_scores, ties.method = "first"),
score_adjusted_quintiles = ceiling(rank_adjusted / (n() / 5))
)
# Subset by GMM-derived ancestry label
AFR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "AFR", ]
HIS <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "HIS", ]
EUR <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "EUR", ]
OTH <- categorized_combined_data[categorized_combined_data$cluster_gmm_Ancestry == "OTH", ]
# Apply the odds-ratio function defined earlier. NOTE: these use score_quintiles
# (the UNadjusted quintiles, presumably computed earlier in the file) — the
# adjusted versions follow below; verify the column exists at this point.
combined_odds_summary = calculate_odds_ratios(categorized_combined_data, Category, score_quintiles)
combined_odds_summary_AFR = calculate_odds_ratios(AFR, Category, score_quintiles)
combined_odds_summary_HIS = calculate_odds_ratios(HIS, Category, score_quintiles)
combined_odds_summary_EUR = calculate_odds_ratios(EUR, Category, score_quintiles)
combined_odds_summary_OTH = calculate_odds_ratios(OTH, Category, score_quintiles)
combined_odds_summary
# A tibble: 5 × 8
score_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 160 48 0.3 1
2 2 137 72 0.526 1.75
3 3 112 96 0.857 2.86
4 4 96 113 1.18 3.92
5 5 32 177 5.53 18.4
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR
# A tibble: 5 × 8
score_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 77 27 0.351 1
2 2 51 22 0.431 1.23
3 3 42 22 0.524 1.49
4 4 22 15 0.682 1.94
5 5 5 35 7 20.0
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS
# A tibble: 5 × 8
score_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 74 11 0.149 1
2 2 47 12 0.255 1.72
3 3 29 15 0.517 3.48
4 4 37 11 0.297 2
5 5 13 9 0.692 4.66
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR
# A tibble: 5 × 8
score_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 8 5 0.625 1
2 2 28 31 1.11 1.77
3 3 33 46 1.39 2.23
4 4 34 76 2.24 3.58
5 5 13 119 9.15 14.6
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH
# A tibble: 5 × 8
score_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 1 5 5 1
2 2 11 7 0.636 0.127
3 3 8 13 1.62 0.325
4 4 3 11 3.67 0.733
5 5 1 14 14 2.8
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Apply the odds-ratio function (defined earlier) to the ancestry-ADJUSTED
# score quintiles, overall and per ancestry subset
combined_odds_summary_adjust= calculate_odds_ratios(categorized_combined_data, Category, score_adjusted_quintiles)
combined_odds_summary_AFR_adjust = calculate_odds_ratios(AFR, Category, score_adjusted_quintiles)
combined_odds_summary_HIS_adjust = calculate_odds_ratios(HIS, Category, score_adjusted_quintiles)
combined_odds_summary_EUR_adjust = calculate_odds_ratios(EUR, Category, score_adjusted_quintiles)
combined_odds_summary_OTH_adjust = calculate_odds_ratios(OTH, Category, score_adjusted_quintiles)
combined_odds_summary_adjust
# A tibble: 5 × 8
score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 134 74 0.552 1
2 2 132 77 0.583 1.06
3 3 115 93 0.809 1.46
4 4 106 103 0.972 1.76
5 5 50 159 3.18 5.76
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_AFR_adjust
# A tibble: 5 × 8
score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 51 24 0.471 1
2 2 47 14 0.298 0.633
3 3 43 26 0.605 1.28
4 4 45 26 0.578 1.23
5 5 11 31 2.82 5.99
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_HIS_adjust
# A tibble: 5 × 8
score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 54 8 0.148 1
2 2 48 11 0.229 1.55
3 3 29 14 0.483 3.26
4 4 41 12 0.293 1.98
5 5 28 13 0.464 3.13
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_EUR_adjust
# A tibble: 5 × 8
score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 26 34 1.31 1
2 2 31 45 1.45 1.11
3 3 34 46 1.35 1.03
4 4 15 54 3.6 2.75
5 5 10 98 9.8 7.49
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
combined_odds_summary_OTH_adjust
# A tibble: 5 × 8
score_adjusted_quintiles count_category_1 count_category_2_6 odds odds_ratio
<dbl> <int> <int> <dbl> <dbl>
1 1 3 8 2.67 1
2 2 6 7 1.17 0.438
3 3 9 7 0.778 0.292
4 4 5 11 2.2 0.825
5 5 1 17 17 6.38
# ℹ 3 more variables: se_log_odds_ratio <dbl>, lower_ci <dbl>, upper_ci <dbl>
# Collect top-quintile (quintile 5) odds ratios and CIs across ancestry groups
# for the UNadjusted score
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary$odds_ratio[5],
combined_odds_summary_AFR$odds_ratio[5],
combined_odds_summary_HIS$odds_ratio[5],
combined_odds_summary_EUR$odds_ratio[5],
combined_odds_summary_OTH$odds_ratio[5]),
lower_ci = c(combined_odds_summary$lower_ci[5],
combined_odds_summary_AFR$lower_ci[5],
combined_odds_summary_HIS$lower_ci[5],
combined_odds_summary_EUR$lower_ci[5],
combined_odds_summary_OTH$lower_ci[5]),
upper_ci = c(combined_odds_summary$upper_ci[5],
combined_odds_summary_AFR$upper_ci[5],
combined_odds_summary_HIS$upper_ci[5],
combined_odds_summary_EUR$upper_ci[5],
combined_odds_summary_OTH$upper_ci[5])
)
# Log2 transformation
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Fixed y-limits shared with the adjusted version of this plot below
ylim_range <- range(-2, 6)
# Plot; col fixed from "#3C8474" (typo) to "#3C8C78", the palette color used
# by the error bars below and by every other plot in this analysis
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups", xaxt = "n", col = "#3C8C78")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "#3C8C78")
# Same top-quintile OR summary for the ancestry-ADJUSTED score (reuses and
# overwrites quintile_5_data from the unadjusted plot above)
quintile_5_data <- data.frame(
group = c("Total","AFR", "HIS", "EUR", "OTH"),
odds_ratio = c(combined_odds_summary_adjust$odds_ratio[5],
combined_odds_summary_AFR_adjust$odds_ratio[5],
combined_odds_summary_HIS_adjust$odds_ratio[5],
combined_odds_summary_EUR_adjust$odds_ratio[5],
combined_odds_summary_OTH_adjust$odds_ratio[5]),
lower_ci = c(combined_odds_summary_adjust$lower_ci[5],
combined_odds_summary_AFR_adjust$lower_ci[5],
combined_odds_summary_HIS_adjust$lower_ci[5],
combined_odds_summary_EUR_adjust$lower_ci[5],
combined_odds_summary_OTH_adjust$lower_ci[5]),
upper_ci = c(combined_odds_summary_adjust$upper_ci[5],
combined_odds_summary_AFR_adjust$upper_ci[5],
combined_odds_summary_HIS_adjust$upper_ci[5],
combined_odds_summary_EUR_adjust$upper_ci[5],
combined_odds_summary_OTH_adjust$upper_ci[5])
)
# Log2 transformation
quintile_5_data$log2_odds_ratio <- log2(quintile_5_data$odds_ratio)
quintile_5_data$log2_lower_ci <- log2(quintile_5_data$lower_ci)
quintile_5_data$log2_upper_ci <- log2(quintile_5_data$upper_ci)
# Same fixed y-limits as the unadjusted plot for direct visual comparison
ylim_range <- range(-2, 6)
# Plot
plot(quintile_5_data$log2_odds_ratio, ylim = ylim_range, pch = 19, xlab = "Group", ylab = "Log2(Odds Ratio)",
main = "Log2(Odds Ratio) for Quintile 5 Across Different Groups, adjusted", xaxt = "n", col = "black")
axis(1, at = 1:5, labels = quintile_5_data$group)
# Add error bars for 95% CI
arrows(x0 = 1:5, y0 = quintile_5_data$log2_lower_ci, y1 = quintile_5_data$log2_upper_ci,
code = 3, angle = 90, length = 0.05, col = "black")
# Convert 'Category' into binary: 0 for Control, 1 for Case (arrest groups
# 2-6 per group_definitions; the original comment said 2-4)
categorized_combined_data$binary_outcome <- ifelse(categorized_combined_data$Category == "Control", 0, 1)
# AUC of the ancestry-adjusted probabilities on the discovery cohort (pROC)
roc_result_full_adjusted <- roc(categorized_combined_data$binary_outcome, categorized_combined_data$adjusted_probability, plot = TRUE)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
auc(roc_result_full_adjusted)
Area under the curve: 0.6589
# Remove the ancestry (PC1-PC20) component from the replication scores using
# the coefficients fit on DISCOVERY controls (same correction as above, so the
# adjustment is not refit on replication data). Matrix product replaces the
# 20 hand-typed per-PC terms — same arithmetic, no copy-paste risk.
pc_names_repl <- paste0("PC", 1:20)
categorized_replication_data$adjusted_scores_replication <- categorized_replication_data$scores_replication -
(ancestry_coefficients["(Intercept)"] +
as.vector(as.matrix(categorized_replication_data[, pc_names_repl]) %*%
ancestry_coefficients[pc_names_repl]))
# Map ancestry-adjusted replication scores (PC1-PC20 effects removed) back to
# probabilities
categorized_replication_data <- categorized_replication_data %>%
mutate(adjusted_probability_replication = plogis(adjusted_scores_replication))
# Compute the ROC curve; note this OVERWRITES roc_result_full_adjusted from
# the discovery analysis above
roc_result_full_adjusted <- roc(categorized_replication_data$binary_outcome, categorized_replication_data$adjusted_probability_replication)
Setting levels: control = 0, case = 1
Setting direction: controls < cases
auc(roc_result_full_adjusted)
Area under the curve: 0.8287
# Plot the ROC curve with the x-axis reversed (specificity from 1 to 0)
plot(roc_result_full_adjusted,xlim = c(1, 0))